| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260 |
- // SPDX-License-Identifier: GPL-2.0
- /*
- * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
- */
- #include <asm/asm.h>
- #include <asm/regdef.h>
- #include <linux/linkage.h>
- .text
- .macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
- \op \d0, \d0, \s0
- \op \d1, \d1, \s1
- \op \d2, \d2, \s2
- \op \d3, \d3, \s3
- .endm
- /*
- * Very basic LoongArch implementation of ChaCha20. Produces a given positive
- * number of blocks of output with a nonce of 0, taking an input key and
- * 8-byte counter. Importantly does not spill to the stack. Its arguments
- * are:
- *
- * a0: output bytes
- * a1: 32-byte key input
- * a2: 8-byte counter input/output
- * a3: number of 64-byte blocks to write to output
- */
- SYM_FUNC_START(__arch_chacha20_blocks_nostack)
- /* We don't need a frame pointer */
- #define s9 fp
- #define output a0
- #define key a1
- #define counter a2
- #define nblocks a3
- #define i a4
- #define state0 s0
- #define state1 s1
- #define state2 s2
- #define state3 s3
- #define state4 s4
- #define state5 s5
- #define state6 s6
- #define state7 s7
- #define state8 s8
- #define state9 s9
- #define state10 a5
- #define state11 a6
- #define state12 a7
- #define state13 t0
- #define state14 t1
- #define state15 t2
- #define cnt_lo t3
- #define cnt_hi t4
- #define copy0 t5
- #define copy1 t6
- #define copy2 t7
- /* Reuse i as copy3 */
- #define copy3 i
- /* Packs to be used with OP_4REG */
- #define line0 state0, state1, state2, state3
- #define line1 state4, state5, state6, state7
- #define line2 state8, state9, state10, state11
- #define line3 state12, state13, state14, state15
- #define line1_perm state5, state6, state7, state4
- #define line2_perm state10, state11, state8, state9
- #define line3_perm state15, state12, state13, state14
- #define copy copy0, copy1, copy2, copy3
- #define _16 16, 16, 16, 16
- #define _20 20, 20, 20, 20
- #define _24 24, 24, 24, 24
- #define _25 25, 25, 25, 25
- /*
- * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
- * This does not violate the stack-less requirement: no sensitive data
- * is spilled onto the stack.
- */
- PTR_ADDI sp, sp, (-SZREG * 10) & STACK_ALIGN
- REG_S s0, sp, 0
- REG_S s1, sp, SZREG
- REG_S s2, sp, SZREG * 2
- REG_S s3, sp, SZREG * 3
- REG_S s4, sp, SZREG * 4
- REG_S s5, sp, SZREG * 5
- REG_S s6, sp, SZREG * 6
- REG_S s7, sp, SZREG * 7
- REG_S s8, sp, SZREG * 8
- REG_S s9, sp, SZREG * 9
- li.w copy0, 0x61707865
- li.w copy1, 0x3320646e
- li.w copy2, 0x79622d32
- ld.w cnt_lo, counter, 0
- ld.w cnt_hi, counter, 4
- .Lblock:
- /* state[0,1,2,3] = "expand 32-byte k" */
- move state0, copy0
- move state1, copy1
- move state2, copy2
- li.w state3, 0x6b206574
- /* state[4,5,..,11] = key */
- ld.w state4, key, 0
- ld.w state5, key, 4
- ld.w state6, key, 8
- ld.w state7, key, 12
- ld.w state8, key, 16
- ld.w state9, key, 20
- ld.w state10, key, 24
- ld.w state11, key, 28
- /* state[12,13] = counter */
- move state12, cnt_lo
- move state13, cnt_hi
- /* state[14,15] = 0 */
- move state14, zero
- move state15, zero
- li.w i, 10
- .Lpermute:
- /* odd round */
- OP_4REG add.w line0, line1
- OP_4REG xor line3, line0
- OP_4REG rotri.w line3, _16
- OP_4REG add.w line2, line3
- OP_4REG xor line1, line2
- OP_4REG rotri.w line1, _20
- OP_4REG add.w line0, line1
- OP_4REG xor line3, line0
- OP_4REG rotri.w line3, _24
- OP_4REG add.w line2, line3
- OP_4REG xor line1, line2
- OP_4REG rotri.w line1, _25
- /* even round */
- OP_4REG add.w line0, line1_perm
- OP_4REG xor line3_perm, line0
- OP_4REG rotri.w line3_perm, _16
- OP_4REG add.w line2_perm, line3_perm
- OP_4REG xor line1_perm, line2_perm
- OP_4REG rotri.w line1_perm, _20
- OP_4REG add.w line0, line1_perm
- OP_4REG xor line3_perm, line0
- OP_4REG rotri.w line3_perm, _24
- OP_4REG add.w line2_perm, line3_perm
- OP_4REG xor line1_perm, line2_perm
- OP_4REG rotri.w line1_perm, _25
- addi.w i, i, -1
- bnez i, .Lpermute
- /*
- * copy[3] = "expa", materialize it here because copy[3] shares the
- * same register with i which just became dead.
- */
- li.w copy3, 0x6b206574
- /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
- OP_4REG add.w line0, copy
- st.w state0, output, 0
- st.w state1, output, 4
- st.w state2, output, 8
- st.w state3, output, 12
- /* from now on state[0,1,2,3] are scratch registers */
- /* state[0,1,2,3] = lo32(key) */
- ld.w state0, key, 0
- ld.w state1, key, 4
- ld.w state2, key, 8
- ld.w state3, key, 12
- /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
- OP_4REG add.w line1, line0
- st.w state4, output, 16
- st.w state5, output, 20
- st.w state6, output, 24
- st.w state7, output, 28
- /* state[0,1,2,3] = hi32(key) */
- ld.w state0, key, 16
- ld.w state1, key, 20
- ld.w state2, key, 24
- ld.w state3, key, 28
- /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
- OP_4REG add.w line2, line0
- st.w state8, output, 32
- st.w state9, output, 36
- st.w state10, output, 40
- st.w state11, output, 44
- /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
- add.w state12, state12, cnt_lo
- add.w state13, state13, cnt_hi
- st.w state12, output, 48
- st.w state13, output, 52
- st.w state14, output, 56
- st.w state15, output, 60
- /* ++counter */
- addi.w cnt_lo, cnt_lo, 1
- sltui state0, cnt_lo, 1
- add.w cnt_hi, cnt_hi, state0
- /* output += 64 */
- PTR_ADDI output, output, 64
- /* --nblocks */
- PTR_ADDI nblocks, nblocks, -1
- bnez nblocks, .Lblock
- /* counter = [cnt_lo, cnt_hi] */
- st.w cnt_lo, counter, 0
- st.w cnt_hi, counter, 4
- /*
- * Zero out the potentially sensitive regs, in case nothing uses these
- * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and
- * state[0,...,9] are s0-s9 those we'll restore in the epilogue, so we
- * only need to zero state[11,...,15].
- */
- move state10, zero
- move state11, zero
- move state12, zero
- move state13, zero
- move state14, zero
- move state15, zero
- REG_L s0, sp, 0
- REG_L s1, sp, SZREG
- REG_L s2, sp, SZREG * 2
- REG_L s3, sp, SZREG * 3
- REG_L s4, sp, SZREG * 4
- REG_L s5, sp, SZREG * 5
- REG_L s6, sp, SZREG * 6
- REG_L s7, sp, SZREG * 7
- REG_L s8, sp, SZREG * 8
- REG_L s9, sp, SZREG * 9
- PTR_ADDI sp, sp, -((-SZREG * 10) & STACK_ALIGN)
- jr ra
- SYM_FUNC_END(__arch_chacha20_blocks_nostack)
|