sm4-ce-core.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch   armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
        20, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
        .inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
        .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
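
/*
 * The sm4e and sm4ekey macros emit the SM4 instructions as raw .inst
 * words so the file assembles even with toolchains that lack the SM4
 * mnemonics; the .irp block above maps each vN.4s operand name to the
 * register number spliced into the instruction encoding.
 */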

/* Register macros */
#define RTMP0   v16
#define RTMP1   v17
#define RTMP2   v18
#define RTMP3   v19

#define RIV     v20
#define RMAC    v20
#define RMASK   v21
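
/*
 * Note that RIV and RMAC alias v20: the IV is only live in the
 * CBC/CTS routines and the MAC only in sm4_ce_mac_update, so the two
 * names are never in use at the same time.
 */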

.align 3
SYM_FUNC_START(sm4_ce_expand_key)
        /* input:
         *   x0: 128-bit key
         *   x1: rkey_enc
         *   x2: rkey_dec
         *   x3: fk array
         *   x4: ck array
         */
        ld1             {v0.16b}, [x0];
        rev32           v0.16b, v0.16b;

        ld1             {v1.16b}, [x3];
        /* load ck */
        ld1             {v24.16b-v27.16b}, [x4], #64;
        ld1             {v28.16b-v31.16b}, [x4];

        /* input ^ fk */
        eor             v0.16b, v0.16b, v1.16b;

        sm4ekey         v0.4s, v0.4s, v24.4s;
        sm4ekey         v1.4s, v0.4s, v25.4s;
        sm4ekey         v2.4s, v1.4s, v26.4s;
        sm4ekey         v3.4s, v2.4s, v27.4s;
        sm4ekey         v4.4s, v3.4s, v28.4s;
        sm4ekey         v5.4s, v4.4s, v29.4s;
        sm4ekey         v6.4s, v5.4s, v30.4s;
        sm4ekey         v7.4s, v6.4s, v31.4s;

        adr_l           x5, .Lbswap128_mask
        ld1             {v24.16b}, [x5]

        st1             {v0.16b-v3.16b}, [x1], #64;
        st1             {v4.16b-v7.16b}, [x1];

        tbl             v16.16b, {v7.16b}, v24.16b
        tbl             v17.16b, {v6.16b}, v24.16b
        tbl             v18.16b, {v5.16b}, v24.16b
        tbl             v19.16b, {v4.16b}, v24.16b
        tbl             v20.16b, {v3.16b}, v24.16b
        tbl             v21.16b, {v2.16b}, v24.16b
        tbl             v22.16b, {v1.16b}, v24.16b
        tbl             v23.16b, {v0.16b}, v24.16b

        st1             {v16.16b-v19.16b}, [x2], #64
        st1             {v20.16b-v23.16b}, [x2]

        ret;
SYM_FUNC_END(sm4_ce_expand_key)
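
/*
 * The decryption round keys are the encryption round keys in reverse
 * order: the tbl shuffles through .Lbswap128_mask reverse the words of
 * each 128-bit group while v7..v0 are emitted in reverse. A sketch of
 * the assumed C-side prototype (the real declaration lives in the glue
 * code, not in this file):
 *
 *   asmlinkage void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc,
 *                                     u32 *rkey_dec, const u32 *fk,
 *                                     const u32 *ck);
 */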

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         */
        SM4_PREPARE(x0)

        ld1             {v0.16b}, [x2];
        SM4_CRYPT_BLK(v0);
        st1             {v0.16b}, [x1];

        ret;
SYM_FUNC_END(sm4_ce_crypt_block)
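
/*
 * Single-block encrypt/decrypt; the direction is determined solely by
 * which round key array is passed. Assumed C-side prototype:
 *
 *   asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst,
 *                                      const u8 *src);
 */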

.align 3
SYM_FUNC_START(sm4_ce_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         */
        SM4_PREPARE(x0)

.Lcrypt_loop_blk:
        sub             w3, w3, #8;
        tbnz            w3, #31, .Lcrypt_tail8;

        ld1             {v0.16b-v3.16b}, [x2], #64;
        ld1             {v4.16b-v7.16b}, [x2], #64;

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        st1             {v0.16b-v3.16b}, [x1], #64;
        st1             {v4.16b-v7.16b}, [x1], #64;

        cbz             w3, .Lcrypt_end;
        b               .Lcrypt_loop_blk;

.Lcrypt_tail8:
        add             w3, w3, #8;
        cmp             w3, #4;
        blt             .Lcrypt_tail4;

        sub             w3, w3, #4;

        ld1             {v0.16b-v3.16b}, [x2], #64;
        SM4_CRYPT_BLK4(v0, v1, v2, v3);
        st1             {v0.16b-v3.16b}, [x1], #64;

        cbz             w3, .Lcrypt_end;

.Lcrypt_tail4:
        sub             w3, w3, #1;

        ld1             {v0.16b}, [x2], #16;
        SM4_CRYPT_BLK(v0);
        st1             {v0.16b}, [x1], #16;

        cbnz            w3, .Lcrypt_tail4;

.Lcrypt_end:
        ret;
SYM_FUNC_END(sm4_ce_crypt)
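
/*
 * Blocks are consumed in batches of 8 while possible (the tbnz on bit
 * 31 catches the counter going negative), then a batch of 4, then one
 * at a time. Assumed C-side prototype:
 *
 *   asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst,
 *                                const u8 *src, unsigned int nblocks);
 */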

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

.Lcbc_enc_loop_4x:
        cmp             w4, #4
        blt             .Lcbc_enc_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        eor             v0.16b, v0.16b, RIV.16b
        SM4_CRYPT_BLK(v0)
        eor             v1.16b, v1.16b, v0.16b
        SM4_CRYPT_BLK(v1)
        eor             v2.16b, v2.16b, v1.16b
        SM4_CRYPT_BLK(v2)
        eor             v3.16b, v3.16b, v2.16b
        SM4_CRYPT_BLK(v3)

        st1             {v0.16b-v3.16b}, [x1], #64
        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_enc_end
        b               .Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        st1             {RIV.16b}, [x1], #16

        cbnz            w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_enc)
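
/*
 * CBC encryption is inherently serial: C[i] = E(P[i] ^ C[i-1]), so even
 * the 4x path above runs one SM4_CRYPT_BLK after another and only
 * amortizes the loads and stores. The final ciphertext block is written
 * back through x3 as the new IV so chained calls can resume.
 */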

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcbc_dec_4x

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b
        rev32           v12.16b, v4.16b
        rev32           v13.16b, v5.16b
        rev32           v14.16b, v6.16b
        rev32           v15.16b, v7.16b

        SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b
        eor             v12.16b, v12.16b, v3.16b
        eor             v13.16b, v13.16b, v4.16b
        eor             v14.16b, v14.16b, v5.16b
        eor             v15.16b, v15.16b, v6.16b

        st1             {v8.16b-v11.16b}, [x1], #64
        st1             {v12.16b-v15.16b}, [x1], #64

        mov             RIV.16b, v7.16b

        cbz             w4, .Lcbc_dec_end
        b               .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcbc_dec_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b

        SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b

        st1             {v8.16b-v11.16b}, [x1], #64

        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        rev32           v8.16b, v0.16b

        SM4_CRYPT_BLK_BE(v8)

        eor             v8.16b, v8.16b, RIV.16b
        st1             {v8.16b}, [x1], #16

        mov             RIV.16b, v0.16b

        cbnz            w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_dec)
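
/*
 * Unlike encryption, CBC decryption has no inter-block dependency
 * (P[i] = D(C[i]) ^ C[i-1]), so up to 8 blocks are decrypted in one
 * SM4_CRYPT_BLK8_BE call; the original ciphertext is preserved in
 * v0-v7 to supply the XOR inputs and the next IV.
 */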

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         */
        SM4_PREPARE(x0)

        sub             w5, w4, #16
        uxtw            x5, w5

        ld1             {RIV.16b}, [x3]

        ld1             {v0.16b}, [x2]
        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v0.16b, {RIV.16b}, v3.16b
        /* padding Pn with zeros */
        tbl             v1.16b, {v1.16b}, v4.16b

        eor             v1.16b, v1.16b, RIV.16b
        SM4_CRYPT_BLK(v1)

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v0.16b}, [x5]
        st1             {v1.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)
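
/*
 * CBC ciphertext stealing over the final two blocks: x5 = nbytes - 16
 * is the tail length Ln. En-1 is computed first (in RIV), the tbl
 * through the permute table truncates it to the Ln-byte Cn, the tail
 * Pn is zero-padded and encrypted as usual, and overlapping stores
 * write a result that is exactly nbytes long.
 */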

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         */
        SM4_PREPARE(x0)

        sub             w5, w4, #16
        uxtw            x5, w5

        ld1             {RIV.16b}, [x3]

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        ld1             {v0.16b}, [x2], x5
        ld1             {v1.16b}, [x2]

        SM4_CRYPT_BLK(v0)

        /* select the first Ln bytes of Xn to create Pn */
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        /* overwrite the first Ln bytes with Cn to create En-1 */
        tbx             v0.16b, {v1.16b}, v4.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, RIV.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)
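
/*
 * CTS decryption mirrors the above: Cn-1 is decrypted to Xn, tbl
 * selects its first Ln bytes which XORed with Cn give Pn, tbx splices
 * Cn back over those same bytes to rebuild En-1, and decrypting that
 * and XORing with the IV recovers Pn-1.
 */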

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ldp             x7, x8, [x3]
        rev             x7, x7
        rev             x8, x8

.Lctr_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lctr_4x

#define inc_le128(vctr)                                 \
                mov             vctr.d[1], x8;          \
                mov             vctr.d[0], x7;          \
                adds            x8, x8, #1;             \
                rev64           vctr.16b, vctr.16b;     \
                adc             x7, x7, xzr;

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */
        inc_le128(v4)                   /* +4 */
        inc_le128(v5)                   /* +5 */
        inc_le128(v6)                   /* +6 */
        inc_le128(v7)                   /* +7 */

        ld1             {v8.16b-v11.16b}, [x2], #64
        ld1             {v12.16b-v15.16b}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lctr_end
        b               .Lctr_loop_8x

.Lctr_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lctr_loop_1x

        sub             w4, w4, #4

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */

        ld1             {v8.16b-v11.16b}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w4, .Lctr_end

.Lctr_loop_1x:
        sub             w4, w4, #1

        /* construct CTRs */
        inc_le128(v0)

        ld1             {v8.16b}, [x2], #16

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        cbnz            w4, .Lctr_loop_1x

.Lctr_end:
        /* store new CTR */
        rev             x7, x7
        rev             x8, x8
        stp             x7, x8, [x3]

        ret
SYM_FUNC_END(sm4_ce_ctr_enc)
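
/*
 * The counter is big endian in memory but is kept as native integers
 * in x7:x8; inc_le128 materializes the current value into a vector
 * (rev64 restores the byte order after the halves are placed into
 * d[0]/d[1]) and post-increments with adds/adc so the carry ripples
 * across the full 128 bits.
 */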

#define tweak_next(vt, vin, RTMP)                               \
        sshr            RTMP.2d, vin.2d, #63;                   \
        and             RTMP.16b, RTMP.16b, RMASK.16b;          \
        add             vt.2d, vin.2d, vin.2d;                  \
        ext             RTMP.16b, RTMP.16b, RTMP.16b, #8;       \
        eor             vt.16b, vt.16b, RTMP.16b;
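
/*
 * tweak_next computes the XTS tweak multiplication by x in GF(2^128):
 * the add doubles both 64-bit halves, sshr #63 extracts the bits
 * shifted out, and the ext/eor pair feeds the low half's carry into
 * the high half and folds the high half's carry back in as 0x87, the
 * reduction constant of x^128 + x^7 + x^2 + x + 1. RMASK is set up as
 * { 0x1, 0x87 } by the movi/uzp1 sequence in the callers below.
 */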

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_enc_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
        SM4_PREPARE(x0)

        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5

        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_enc_4x

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_enc_cts
        b               .Lxts_enc_loop_8x

.Lxts_enc_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_enc_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
        cbz             x5, .Lxts_enc_end

        /* cipher text stealing */

        tweak_next(v9, v8, RTMP0)

        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* padding Pn with En-1 at the end */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_enc_ret

.Lxts_enc_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_enc_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_enc)
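
/*
 * When nbytes is not a multiple of 16, the lsr/csel sequence above
 * holds one full block back (w4 becomes nblocks - 1) so it can be
 * combined with the tail in .Lxts_enc_cts via ciphertext stealing;
 * for aligned lengths the running tweak is stored back through x3
 * instead. A non-NULL x5 makes this call derive the initial tweak by
 * encrypting the IV with the second key.
 */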

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_dec_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
        SM4_PREPARE(x0)

        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5

        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_dec_4x

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_dec_cts
        b               .Lxts_dec_loop_8x

.Lxts_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_dec_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
        cbz             x5, .Lxts_dec_end

        /* cipher text stealing */

        tweak_next(v9, v8, RTMP0)

        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* padding Pn with En-1 at the end */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_dec_ret

.Lxts_dec_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_dec_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_dec)
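
/*
 * Identical structure to sm4_ce_xts_enc, except that in the CTS tail
 * the tweaks are applied in the opposite order: the second-to-last
 * block is processed with the next tweak (v9) and the stolen block
 * with the current one (v8).
 */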

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
        /* input:
         *   x0: round key array, CTX
         *   x1: digest
         *   x2: src
         *   w3: nblocks
         *   w4: enc_before
         *   w5: enc_after
         */
        SM4_PREPARE(x0)

        ld1             {RMAC.16b}, [x1]

        cbz             w4, .Lmac_update

        SM4_CRYPT_BLK(RMAC)

.Lmac_update:
        cbz             w3, .Lmac_ret

        sub             w6, w3, #1
        cmp             w5, wzr
        csel            w3, w3, w6, ne

        cbz             w3, .Lmac_end

.Lmac_loop_4x:
        cmp             w3, #4
        blt             .Lmac_loop_1x

        sub             w3, w3, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v1.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v2.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v3.16b
        SM4_CRYPT_BLK(RMAC)

        cbz             w3, .Lmac_end
        b               .Lmac_loop_4x

.Lmac_loop_1x:
        sub             w3, w3, #1

        ld1             {v0.16b}, [x2], #16

        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)

        cbnz            w3, .Lmac_loop_1x

.Lmac_end:
        cbnz            w5, .Lmac_ret

        ld1             {v0.16b}, [x2], #16
        eor             RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
        st1             {RMAC.16b}, [x1]
        ret
SYM_FUNC_END(sm4_ce_mac_update)
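
/*
 * CBC-MAC style update: each block is XORed into the running MAC and
 * the result re-encrypted. enc_before encrypts the incoming digest
 * once before absorbing data; when enc_after is zero the last block is
 * held back (the csel above) and only XORed in at .Lmac_end, leaving
 * the final encryption to the caller.
 */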

        .section ".rodata", "a"
        .align 4
.Lbswap128_mask:
        .byte           0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
        .byte           0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
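
/*
 * The 48-byte permute table below serves all CTS paths: a 16-byte
 * window loaded at .Lcts_permute_table + Ln yields indices that shift
 * a block by the tail length Ln (0xff lanes read as zero for tbl and
 * are skipped by tbx), while the window at +32 - Ln gives the
 * complementary shuffle.
 */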
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        .byte           0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff