sm4-neon-core.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15
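
/*
 * Note: RX0/RX1/RKEY/RIV reuse v12-v15 and therefore alias RTMP4-RTMP7.
 * The round macros only touch RTMP0-RTMP3 and RX0/RX1, so RKEY and RIV
 * stay live across a round; code that uses RTMP4-RTMP7 must consume
 * RIV first (see the CBC path below).
 */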

/* Helper macros. */
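
/*
 * SM4_PREPARE() loads the 256-byte crypto_sm4_sbox into v16-v31 so the
 * non-linear layer can be evaluated with tbl/tbx lookups, 64 table bytes
 * per group of four registers.
 */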
#define SM4_PREPARE()	\
	adr_l	x5, crypto_sm4_sbox;	\
	ld1	{v16.16b-v19.16b}, [x5], #64;	\
	ld1	{v20.16b-v23.16b}, [x5], #64;	\
	ld1	{v24.16b-v27.16b}, [x5], #64;	\
	ld1	{v28.16b-v31.16b}, [x5];
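
/*
 * transpose_4x4() treats four 128-bit registers as a 4x4 matrix of 32-bit
 * words and transposes it, converting four blocks from one-block-per-register
 * layout to one-word-lane-per-register layout, which is the form the round
 * macros operate on.
 */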
#define transpose_4x4(s0, s1, s2, s3)	\
	zip1	RTMP0.4s, s0.4s, s1.4s;	\
	zip1	RTMP1.4s, s2.4s, s3.4s;	\
	zip2	RTMP2.4s, s0.4s, s1.4s;	\
	zip2	RTMP3.4s, s2.4s, s3.4s;	\
	zip1	s0.2d, RTMP0.2d, RTMP1.2d;	\
	zip2	s1.2d, RTMP0.2d, RTMP1.2d;	\
	zip1	s2.2d, RTMP2.2d, RTMP3.2d;	\
	zip2	s3.2d, RTMP2.2d, RTMP3.2d;

#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)	\
	zip1	RTMP0.4s, s0.4s, s1.4s;	\
	zip1	RTMP1.4s, s2.4s, s3.4s;	\
	zip2	RTMP2.4s, s0.4s, s1.4s;	\
	zip2	RTMP3.4s, s2.4s, s3.4s;	\
	zip1	RTMP4.4s, s4.4s, s5.4s;	\
	zip1	RTMP5.4s, s6.4s, s7.4s;	\
	zip2	RTMP6.4s, s4.4s, s5.4s;	\
	zip2	RTMP7.4s, s6.4s, s7.4s;	\
	zip1	s0.2d, RTMP0.2d, RTMP1.2d;	\
	zip2	s1.2d, RTMP0.2d, RTMP1.2d;	\
	zip1	s2.2d, RTMP2.2d, RTMP3.2d;	\
	zip2	s3.2d, RTMP2.2d, RTMP3.2d;	\
	zip1	s4.2d, RTMP4.2d, RTMP5.2d;	\
	zip2	s5.2d, RTMP4.2d, RTMP5.2d;	\
	zip1	s6.2d, RTMP6.2d, RTMP7.2d;	\
	zip2	s7.2d, RTMP6.2d, RTMP7.2d;
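
/*
 * rotate_clockwise_4x4() rotates the 4x4 word matrix 90 degrees clockwise.
 * This converts the state back from word-lane layout to block layout while
 * simultaneously reversing the word order, i.e. it also performs SM4's
 * final reverse transform R(A0, A1, A2, A3) = (A3, A2, A1, A0).
 */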
#define rotate_clockwise_4x4(s0, s1, s2, s3)	\
	zip1	RTMP0.4s, s1.4s, s0.4s;	\
	zip2	RTMP1.4s, s1.4s, s0.4s;	\
	zip1	RTMP2.4s, s3.4s, s2.4s;	\
	zip2	RTMP3.4s, s3.4s, s2.4s;	\
	zip1	s0.2d, RTMP2.2d, RTMP0.2d;	\
	zip2	s1.2d, RTMP2.2d, RTMP0.2d;	\
	zip1	s2.2d, RTMP3.2d, RTMP1.2d;	\
	zip2	s3.2d, RTMP3.2d, RTMP1.2d;

#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)	\
	zip1	RTMP0.4s, s1.4s, s0.4s;	\
	zip1	RTMP2.4s, s3.4s, s2.4s;	\
	zip2	RTMP1.4s, s1.4s, s0.4s;	\
	zip2	RTMP3.4s, s3.4s, s2.4s;	\
	zip1	RTMP4.4s, s5.4s, s4.4s;	\
	zip1	RTMP6.4s, s7.4s, s6.4s;	\
	zip2	RTMP5.4s, s5.4s, s4.4s;	\
	zip2	RTMP7.4s, s7.4s, s6.4s;	\
	zip1	s0.2d, RTMP2.2d, RTMP0.2d;	\
	zip2	s1.2d, RTMP2.2d, RTMP0.2d;	\
	zip1	s2.2d, RTMP3.2d, RTMP1.2d;	\
	zip2	s3.2d, RTMP3.2d, RTMP1.2d;	\
	zip1	s4.2d, RTMP6.2d, RTMP4.2d;	\
	zip2	s5.2d, RTMP6.2d, RTMP4.2d;	\
	zip1	s6.2d, RTMP7.2d, RTMP5.2d;	\
	zip2	s7.2d, RTMP7.2d, RTMP5.2d;
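
/*
 * ROUND4() computes one SM4 round on four word-lane-transposed blocks:
 *
 *   s0 ^= T(s1 ^ s2 ^ s3 ^ rk)
 *
 * where T = L o tau: tau applies the 8-bit S-box to each byte, and
 * L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24),
 * computed below as (x ^ rol32(x, 24)) ^ rol32(x ^ rol32(x, 8) ^
 * rol32(x, 16), 2).
 *
 * The S-box lookup walks the 256-byte table in four 64-byte chunks:
 * tbl zeroes lanes whose index is out of range, while tbx leaves them
 * unchanged, so subtracting 64 from the indices between steps selects
 * the matching quarter of the table for every byte.
 */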
#define ROUND4(round, s0, s1, s2, s3)	\
	dup	RX0.4s, RKEY.s[round];	\
	/* rk ^ s1 ^ s2 ^ s3 */	\
	eor	RTMP1.16b, s2.16b, s3.16b;	\
	eor	RX0.16b, RX0.16b, s1.16b;	\
	eor	RX0.16b, RX0.16b, RTMP1.16b;	\
	\
	/* sbox, non-linear part */	\
	movi	RTMP3.16b, #64;  /* sizeof(sbox) / 4 */	\
	tbl	RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;	\
	tbx	RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;	\
	tbx	RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;	\
	tbx	RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
	\
	/* linear part */	\
	shl	RTMP1.4s, RTMP0.4s, #8;	\
	shl	RTMP2.4s, RTMP0.4s, #16;	\
	shl	RTMP3.4s, RTMP0.4s, #24;	\
	sri	RTMP1.4s, RTMP0.4s, #(32-8);	\
	sri	RTMP2.4s, RTMP0.4s, #(32-16);	\
	sri	RTMP3.4s, RTMP0.4s, #(32-24);	\
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */	\
	eor	RTMP1.16b, RTMP1.16b, RTMP0.16b;	\
	eor	RTMP1.16b, RTMP1.16b, RTMP2.16b;	\
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */	\
	eor	RTMP3.16b, RTMP3.16b, RTMP0.16b;	\
	shl	RTMP2.4s, RTMP1.4s, #2;	\
	sri	RTMP2.4s, RTMP1.4s, #(32-2);	\
	eor	RTMP3.16b, RTMP3.16b, RTMP2.16b;	\
	/* s0 ^= RTMP3 */	\
	eor	s0.16b, s0.16b, RTMP3.16b;
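
/*
 * SM4_CRYPT_BLK4_BE() runs the full 32 rounds as 8 loop iterations of
 * four ROUND4() steps with rotating word roles, then byte-swaps and
 * rotates the state back to output order.  Input must already be
 * big-endian and transposed.  The final sub rewinds x0 past the 32
 * round keys (32 * 4 = 128 bytes) consumed by the loop.
 */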
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)	\
	mov	x6, #8;	\
4:	\
	ld1	{RKEY.4s}, [x0], #16;	\
	subs	x6, x6, #1;	\
	\
	ROUND4(0, b0, b1, b2, b3);	\
	ROUND4(1, b1, b2, b3, b0);	\
	ROUND4(2, b2, b3, b0, b1);	\
	ROUND4(3, b3, b0, b1, b2);	\
	\
	bne	4b;	\
	\
	rev32	b0.16b, b0.16b;	\
	rev32	b1.16b, b1.16b;	\
	rev32	b2.16b, b2.16b;	\
	rev32	b3.16b, b3.16b;	\
	\
	rotate_clockwise_4x4(b0, b1, b2, b3);	\
	\
	/* rewind to the start of the round keys */	\
	sub	x0, x0, #128;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)	\
	rev32	b0.16b, b0.16b;	\
	rev32	b1.16b, b1.16b;	\
	rev32	b2.16b, b2.16b;	\
	rev32	b3.16b, b3.16b;	\
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
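
/*
 * ROUND8() is ROUND4() applied to two independent groups of four blocks
 * (s0-s3 and t0-t3) with the instruction streams interleaved, so the
 * long tbl/tbx and eor dependency chains of the two groups can overlap
 * in the pipeline.
 */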
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)	\
	/* rk ^ s1 ^ s2 ^ s3 */	\
	dup	RX0.4s, RKEY.s[round];	\
	eor	RTMP0.16b, s2.16b, s3.16b;	\
	mov	RX1.16b, RX0.16b;	\
	eor	RTMP1.16b, t2.16b, t3.16b;	\
	eor	RX0.16b, RX0.16b, s1.16b;	\
	eor	RX1.16b, RX1.16b, t1.16b;	\
	eor	RX0.16b, RX0.16b, RTMP0.16b;	\
	eor	RX1.16b, RX1.16b, RTMP1.16b;	\
	\
	/* sbox, non-linear part */	\
	movi	RTMP3.16b, #64;  /* sizeof(sbox) / 4 */	\
	tbl	RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	tbl	RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;	\
	sub	RX1.16b, RX1.16b, RTMP3.16b;	\
	tbx	RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	tbx	RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;	\
	sub	RX1.16b, RX1.16b, RTMP3.16b;	\
	tbx	RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	tbx	RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;	\
	sub	RX1.16b, RX1.16b, RTMP3.16b;	\
	tbx	RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
	tbx	RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;	\
	\
	/* linear part */	\
	shl	RX0.4s, RTMP0.4s, #8;	\
	shl	RX1.4s, RTMP1.4s, #8;	\
	shl	RTMP2.4s, RTMP0.4s, #16;	\
	shl	RTMP3.4s, RTMP1.4s, #16;	\
	sri	RX0.4s, RTMP0.4s, #(32 - 8);	\
	sri	RX1.4s, RTMP1.4s, #(32 - 8);	\
	sri	RTMP2.4s, RTMP0.4s, #(32 - 16);	\
	sri	RTMP3.4s, RTMP1.4s, #(32 - 16);	\
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */	\
	eor	RX0.16b, RX0.16b, RTMP0.16b;	\
	eor	RX1.16b, RX1.16b, RTMP1.16b;	\
	eor	RX0.16b, RX0.16b, RTMP2.16b;	\
	eor	RX1.16b, RX1.16b, RTMP3.16b;	\
	/* RTMP0/1 = x ^ rol32(x, 24) ^ rol32(RX, 2) */	\
	shl	RTMP2.4s, RTMP0.4s, #24;	\
	shl	RTMP3.4s, RTMP1.4s, #24;	\
	sri	RTMP2.4s, RTMP0.4s, #(32 - 24);	\
	sri	RTMP3.4s, RTMP1.4s, #(32 - 24);	\
	eor	RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor	RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	shl	RTMP2.4s, RX0.4s, #2;	\
	shl	RTMP3.4s, RX1.4s, #2;	\
	sri	RTMP2.4s, RX0.4s, #(32 - 2);	\
	sri	RTMP3.4s, RX1.4s, #(32 - 2);	\
	eor	RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor	RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	/* s0/t0 ^= RTMP0/1 */	\
	eor	s0.16b, s0.16b, RTMP0.16b;	\
	eor	t0.16b, t0.16b, RTMP1.16b;
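
/*
 * 8-block variant of SM4_CRYPT_BLK4(): byte-swap, run 32 rounds via
 * ROUND8(), byte-swap back.  The _norotate form leaves the state in
 * word-lane layout so the caller can choose how to untranspose it.
 */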
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7)	\
	rev32	b0.16b, b0.16b;	\
	rev32	b1.16b, b1.16b;	\
	rev32	b2.16b, b2.16b;	\
	rev32	b3.16b, b3.16b;	\
	rev32	b4.16b, b4.16b;	\
	rev32	b5.16b, b5.16b;	\
	rev32	b6.16b, b6.16b;	\
	rev32	b7.16b, b7.16b;	\
	\
	mov	x6, #8;	\
8:	\
	ld1	{RKEY.4s}, [x0], #16;	\
	subs	x6, x6, #1;	\
	\
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);	\
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);	\
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);	\
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);	\
	\
	bne	8b;	\
	\
	rev32	b0.16b, b0.16b;	\
	rev32	b1.16b, b1.16b;	\
	rev32	b2.16b, b2.16b;	\
	rev32	b3.16b, b3.16b;	\
	rev32	b4.16b, b4.16b;	\
	rev32	b5.16b, b5.16b;	\
	rev32	b6.16b, b6.16b;	\
	rev32	b7.16b, b7.16b;	\
	\
	/* rewind to the start of the round keys */	\
	sub	x0, x0, #128;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)	\
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);	\
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);

.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE()
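
	/*
	 * ld4 with .4s elements de-interleaves as it loads: v0 receives
	 * word 0 of all four blocks, v1 word 1, and so on.  That is exactly
	 * the transposed layout the round macros expect, so the 8x and 4x
	 * paths get the transpose for free; only the sub-4-block tail needs
	 * an ld1 load plus an explicit transpose_4x4().
	 */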

.Lcrypt_loop_8x:
	sub	w3, w3, #8
	tbnz	w3, #31, .Lcrypt_4x

	ld4	{v0.4s-v3.4s}, [x2], #64
	ld4	{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w3, .Lcrypt_end
	b	.Lcrypt_loop_8x

.Lcrypt_4x:
	add	w3, w3, #8
	cmp	w3, #4
	blt	.Lcrypt_tail

	sub	w3, w3, #4

	ld4	{v0.4s-v3.4s}, [x2], #64
	SM4_CRYPT_BLK4(v0, v1, v2, v3)
	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w3, .Lcrypt_end

.Lcrypt_tail:
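	/* 1 to 3 blocks remain; v3 is a don't-care lane in the transpose. */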
	cmp	w3, #2
	ld1	{v0.16b}, [x2], #16
	blt	.Lcrypt_tail_load_done
	ld1	{v1.16b}, [x2], #16
	beq	.Lcrypt_tail_load_done
	ld1	{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp	w3, #2
	st1	{v0.16b}, [x1], #16
	blt	.Lcrypt_end
	st1	{v1.16b}, [x1], #16
	beq	.Lcrypt_end
	st1	{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ld1	{RIV.16b}, [x3]
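
	/*
	 * CBC decryption: decrypt a batch, then XOR block i with ciphertext
	 * block i-1 (or with the IV for the first block).  The ciphertext is
	 * reloaded from x2 to provide the chaining values, and the batch's
	 * last ciphertext block becomes the next IV.
	 */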

.Lcbc_dec_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lcbc_dec_4x

	ld4	{v0.4s-v3.4s}, [x2], #64
	ld4	{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Use two 4x4 rotates: rotate_clockwise_4x4_2x() would clobber
	 * RTMP7, which aliases the RIV register. */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub	x2, x2, #64

	eor	v0.16b, v0.16b, RIV.16b

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1	{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor	v1.16b, v1.16b, RTMP0.16b
	eor	v2.16b, v2.16b, RTMP1.16b
	eor	v3.16b, v3.16b, RTMP2.16b
	eor	v4.16b, v4.16b, RTMP3.16b
	eor	v5.16b, v5.16b, RTMP4.16b
	eor	v6.16b, v6.16b, RTMP5.16b
	eor	v7.16b, v7.16b, RTMP6.16b

	mov	RIV.16b, RTMP7.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lcbc_dec_end
	b	.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lcbc_dec_tail

	sub	w4, w4, #4
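
	/*
	 * Keep the raw ciphertext in v0-v3 and decrypt byte-swapped copies
	 * in v4-v7: the chaining XOR can then use v0-v2 directly, and v3
	 * becomes the next IV.
	 */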
	ld1	{v0.16b-v3.16b}, [x2], #64

	rev32	v4.16b, v0.16b
	rev32	v5.16b, v1.16b
	rev32	v6.16b, v2.16b
	rev32	v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)
	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor	v4.16b, v4.16b, RIV.16b
	eor	v5.16b, v5.16b, v0.16b
	eor	v6.16b, v6.16b, v1.16b
	eor	v7.16b, v7.16b, v2.16b

	mov	RIV.16b, v3.16b

	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	cmp	w4, #2
	ld1	{v0.16b}, [x2], #16
	blt	.Lcbc_dec_tail_load_done
	ld1	{v1.16b}, [x2], #16
	beq	.Lcbc_dec_tail_load_done
	ld1	{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	rev32	v4.16b, v0.16b
	rev32	v5.16b, v1.16b
	rev32	v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)
	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	cmp	w4, #2
	eor	v4.16b, v4.16b, RIV.16b
	mov	RIV.16b, v0.16b
	st1	{v4.16b}, [x1], #16
	blt	.Lcbc_dec_end

	eor	v5.16b, v5.16b, v0.16b
	mov	RIV.16b, v1.16b
	st1	{v5.16b}, [x1], #16
	beq	.Lcbc_dec_end

	eor	v6.16b, v6.16b, v1.16b
	mov	RIV.16b, v2.16b
	st1	{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1	{RIV.16b}, [x3]
	ret
SYM_FUNC_END(sm4_neon_cbc_dec)

.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ldp	x7, x8, [x3]
	rev	x7, x7
	rev	x8, x8
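
	/*
	 * The 128-bit big-endian counter is kept in x7 (high half) and
	 * x8 (low half) in native byte order, so it can be incremented
	 * with a plain adds/adc carry chain.
	 */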

.Lctr_crypt_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lctr_crypt_4x

#define inc_le128(vctr)	\
	mov	vctr.d[1], x8;	\
	mov	vctr.d[0], x7;	\
	adds	x8, x8, #1;	\
	rev64	vctr.16b, vctr.16b;	\
	adc	x7, x7, xzr;
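
	/*
	 * inc_le128() emits the current counter value into vctr, then
	 * advances x7:x8 by one.  rev64 byte-swaps each 64-bit half,
	 * turning the native-endian doublewords back into a big-endian
	 * 128-bit counter block.
	 */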

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)
	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1	{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor	v0.16b, v0.16b, RTMP0.16b
	eor	v1.16b, v1.16b, RTMP1.16b
	eor	v2.16b, v2.16b, RTMP2.16b
	eor	v3.16b, v3.16b, RTMP3.16b
	eor	v4.16b, v4.16b, RTMP4.16b
	eor	v5.16b, v5.16b, RTMP5.16b
	eor	v6.16b, v6.16b, RTMP6.16b
	eor	v7.16b, v7.16b, RTMP7.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lctr_crypt_end
	b	.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lctr_crypt_tail

	sub	w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1	{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)
	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/* inc_le128() clobbers the condition flags (adds/adc), so
	 * re-compare w4 after each increment */
	ld1	{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp	w4, #2
	blt	.Lctr_crypt_tail_load_done

	ld1	{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp	w4, #2
	beq	.Lctr_crypt_tail_load_done

	ld1	{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp	w4, #2

	eor	v0.16b, v0.16b, v4.16b
	st1	{v0.16b}, [x1], #16
	blt	.Lctr_crypt_end

	eor	v1.16b, v1.16b, v5.16b
	st1	{v1.16b}, [x1], #16
	beq	.Lctr_crypt_end

	eor	v2.16b, v2.16b, v6.16b
	st1	{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev	x7, x7
	rev	x8, x8
	stp	x7, x8, [x3]
	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)