/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align	4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
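
/*
 * MAX_STRIDE is the interleave factor: how many blocks the Nx loops below
 * process per iteration. The default of 4 suits the plain NEON
 * implementation; an including file (e.g. the Crypto Extensions wrapper)
 * may define it to 5 to keep deeper pipelines busy. ST4()/ST5() emit their
 * argument only when the corresponding stride is in effect, so both
 * variants can share the same loop bodies.
 */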
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
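
	/*
	 * ECB keeps no chaining state, so each block depends only on the
	 * round keys: the Nx loops below handle MAX_STRIDE blocks per
	 * iteration and the 1x loops mop up whatever remainder is left.
	 */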
AES_FUNC_START(aes_ecb_encrypt)
	frame_push	0

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl		aes_encrypt_block4x	)
ST5(	ld1		{v4.16b}, [x1], #16	)
ST5(	bl		aes_encrypt_block5x	)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16	)
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbencout
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_encrypt)

AES_FUNC_START(aes_ecb_decrypt)
	frame_push	0

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x	)
ST5(	ld1		{v4.16b}, [x1], #16	)
ST5(	bl		aes_decrypt_block5x	)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16	)
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbdecout
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_decrypt)

	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */
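
	/*
	 * CBC encryption is inherently serial (every ciphertext block feeds
	 * the next XOR), so only the decryption path is interleaved
	 * MAX_STRIDE wide. The ESSIV variants differ only in how the initial
	 * IV is derived: the supplied iv[] is first encrypted with the
	 * separate AES-256 key schedule rk2 (hence the hard-coded 14 rounds)
	 * before the ordinary CBC code takes over.
	 */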
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs		w4, w4, #4
	bmi		.Lcbcenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b
	b		.Lcbcencloop4x
.Lcbcenc1x:
	adds		w4, w4, #4
	beq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	frame_push	0
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16	)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	frame_pop
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)

	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
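
	/*
	 * Ciphertext stealing: these entry points only see the final portion
	 * of a message, i.e. one full block plus a final block that may be
	 * partial. The short block is handled with overlapping loads and
	 * stores, using .Lcts_permute_table below to build the tbl/tbx masks
	 * that move the tail bytes into place.
	 */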
AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)

	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
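
	/*
	 * The table above is 16 bytes of 0xff, the identity permutation
	 * 0x0-0xf, then 16 more bytes of 0xff. Loading 16 bytes at an offset
	 * of 'tail length' bytes from the start (or the same distance back
	 * from the 32-byte mark) yields tbl/tbx masks that select just the
	 * wanted bytes and zero (or preserve) the rest, which is what the
	 * CTS and CTR tail handling relies on.
	 */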
	/*
	 * This macro generates the code for CTR and XCTR mode.
	 */
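	/*
	 * In CTR mode the whole IV is treated as a big-endian counter that is
	 * incremented once per block. In XCTR mode (the variant used by
	 * HCTR2) keystream block i is instead AES(K, IV ^ le64(i)), so the
	 * block index derived from byte_ctr is XORed into the low 64 bits of
	 * the IV rather than added to it, and no carry handling is needed.
	 */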
.macro ctr_encrypt xctr
	// Arguments
	OUT		.req x0
	IN		.req x1
	KEY		.req x2
	ROUNDS_W	.req w3
	BYTES_W		.req w4
	IV		.req x5
	BYTE_CTR_W	.req w6		// XCTR only
	// Intermediate values
	CTR_W		.req w11	// XCTR only
	CTR		.req x11	// XCTR only
	IV_PART		.req x12
	BLOCKS		.req x13
	BLOCKS_W	.req w13

	frame_push	0

	enc_prepare	ROUNDS_W, KEY, IV_PART
	ld1		{vctr.16b}, [IV]

	/*
	 * Keep 64 bits of the IV in a register.  For CTR mode this lets us
	 * easily increment the IV.  For XCTR mode this lets us efficiently
	 * XOR the 64-bit counter with the IV.
	 */
	.if \xctr
		umov		IV_PART, vctr.d[0]
		lsr		CTR_W, BYTE_CTR_W, #4
	.else
		umov		IV_PART, vctr.d[1]
		rev		IV_PART, IV_PART
	.endif

.LctrloopNx\xctr:
	add		BLOCKS_W, BYTES_W, #15
	sub		BYTES_W, BYTES_W, #MAX_STRIDE << 4
	lsr		BLOCKS_W, BLOCKS_W, #4
	mov		w8, #MAX_STRIDE
	cmp		BLOCKS_W, w8
	csel		BLOCKS_W, BLOCKS_W, w8, lt

	/*
	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
	 *
	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
	 * handling code expects the last keystream block to be in
	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter
	 * blocks.
	 */
	.if \xctr
		add		CTR, CTR, BLOCKS
	.else
		adds		IV_PART, IV_PART, BLOCKS
	.endif

	mov		v0.16b, vctr.16b
	mov		v1.16b, vctr.16b
	mov		v2.16b, vctr.16b
	mov		v3.16b, vctr.16b
ST5(	mov		v4.16b, vctr.16b	)

	.if \xctr
		sub		x6, CTR, #MAX_STRIDE - 1
		sub		x7, CTR, #MAX_STRIDE - 2
		sub		x8, CTR, #MAX_STRIDE - 3
		sub		x9, CTR, #MAX_STRIDE - 4
ST5(		sub		x10, CTR, #MAX_STRIDE - 5	)

		eor		x6, x6, IV_PART
		eor		x7, x7, IV_PART
		eor		x8, x8, IV_PART
		eor		x9, x9, IV_PART
ST5(		eor		x10, x10, IV_PART	)

		mov		v0.d[0], x6
		mov		v1.d[0], x7
		mov		v2.d[0], x8
		mov		v3.d[0], x9
ST5(		mov		v4.d[0], x10	)
	.else
		bcs		0f
		.subsection	1

		/*
		 * This subsection handles carries.
		 *
		 * Conditional branching here is allowed with respect to time
		 * invariance since the branches are dependent on the IV
		 * instead of the plaintext or key.  This code is rarely
		 * executed in practice anyway.
		 */

		/* Apply carry to outgoing counter. */
0:		umov		x8, vctr.d[0]
		rev		x8, x8
		add		x8, x8, #1
		rev		x8, x8
		ins		vctr.d[0], x8

		/*
		 * Apply carry to counter blocks if needed.
		 *
		 * Since the carry flag was set, we know 0 <= IV_PART <
		 * MAX_STRIDE.  Using the value of IV_PART we can determine
		 * how many counter blocks need to be updated.
		 */
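		/*
		 * Computed branch: each entry below is 8 bytes (bti c plus
		 * one mov), so jumping to 1f - 8 * IV_PART executes exactly
		 * the last IV_PART entries. That copies the freshly
		 * incremented high word of the counter into the blocks whose
		 * low 64 bits wrapped around.
		 */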
		cbz		IV_PART, 2f
		adr		x16, 1f
		sub		x16, x16, IV_PART, lsl #3
		br		x16
		bti		c
		mov		v0.d[0], vctr.d[0]
		bti		c
		mov		v1.d[0], vctr.d[0]
		bti		c
		mov		v2.d[0], vctr.d[0]
		bti		c
		mov		v3.d[0], vctr.d[0]
ST5(		bti		c		)
ST5(		mov		v4.d[0], vctr.d[0]	)
1:		b		2f
		.previous

2:		rev		x7, IV_PART
		ins		vctr.d[1], x7
		sub		x7, IV_PART, #MAX_STRIDE - 1
		sub		x8, IV_PART, #MAX_STRIDE - 2
		sub		x9, IV_PART, #MAX_STRIDE - 3
		rev		x7, x7
		rev		x8, x8
		mov		v1.d[1], x7
		rev		x9, x9
ST5(		sub		x10, IV_PART, #MAX_STRIDE - 4	)
		mov		v2.d[1], x8
ST5(		rev		x10, x10	)
		mov		v3.d[1], x9
ST5(		mov		v4.d[1], x10	)
	.endif

	/*
	 * If there are at least MAX_STRIDE blocks left, XOR the data with
	 * keystream and store.  Otherwise jump to tail handling.
	 */
	tbnz		BYTES_W, #31, .Lctrtail\xctr
	ld1		{v5.16b-v7.16b}, [IN], #48
ST4(	bl		aes_encrypt_block4x	)
ST5(	bl		aes_encrypt_block5x	)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [IN], #16	)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [IN], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b	)
	st1		{v0.16b-v3.16b}, [OUT], #64
ST5(	st1		{v4.16b}, [OUT], #16	)
	cbz		BYTES_W, .Lctrout\xctr
	b		.LctrloopNx\xctr

.Lctrout\xctr:
	.if !\xctr
		st1		{vctr.16b}, [IV]	/* return next CTR value */
	.endif
	frame_pop
	ret

.Lctrtail\xctr:
	/*
	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
	 *
	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
	 * v4 should have the next two counter blocks.
	 *
	 * This allows us to store the ciphertext by writing to overlapping
	 * regions of memory.  Any invalid ciphertext blocks get overwritten by
	 * correctly computed blocks.  This approach greatly simplifies the
	 * logic for storing the ciphertext.
	 */
	mov		x16, #16
	ands		w7, BYTES_W, #0xf
	csel		x13, x7, x16, ne

ST5(	cmp		BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(	csel		x14, x16, xzr, gt	)
	cmp		BYTES_W, #48 - (MAX_STRIDE << 4)
	csel		x15, x16, xzr, gt
	cmp		BYTES_W, #32 - (MAX_STRIDE << 4)
	csel		x16, x16, xzr, gt
	cmp		BYTES_W, #16 - (MAX_STRIDE << 4)

	adr_l		x9, .Lcts_permute_table
	add		x9, x9, x13
	ble		.Lctrtail1x\xctr

ST5(	ld1		{v5.16b}, [IN], x14	)
	ld1		{v6.16b}, [IN], x15
	ld1		{v7.16b}, [IN], x16

ST4(	bl		aes_encrypt_block4x	)
ST5(	bl		aes_encrypt_block5x	)

	ld1		{v8.16b}, [IN], x13
	ld1		{v9.16b}, [IN]
	ld1		{v10.16b}, [x9]

ST4(	eor		v6.16b, v6.16b, v0.16b	)
ST4(	eor		v7.16b, v7.16b, v1.16b	)
ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
ST4(	eor		v8.16b, v8.16b, v2.16b	)
ST4(	eor		v9.16b, v9.16b, v3.16b	)

ST5(	eor		v5.16b, v5.16b, v0.16b	)
ST5(	eor		v6.16b, v6.16b, v1.16b	)
ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
ST5(	eor		v7.16b, v7.16b, v2.16b	)
ST5(	eor		v8.16b, v8.16b, v3.16b	)
ST5(	eor		v9.16b, v9.16b, v4.16b	)

ST5(	st1		{v5.16b}, [OUT], x14	)
	st1		{v6.16b}, [OUT], x15
	st1		{v7.16b}, [OUT], x16
	add		x13, x13, OUT
	st1		{v9.16b}, [x13]		// overlapping stores
	st1		{v8.16b}, [OUT]
	b		.Lctrout\xctr

.Lctrtail1x\xctr:
	/*
	 * Handle <= 16 bytes of plaintext
	 *
	 * This code always reads and writes 16 bytes.  To avoid out of bounds
	 * accesses, XCTR and CTR modes must use a temporary buffer when
	 * encrypting/decrypting less than 16 bytes.
	 *
	 * This code is unusual in that it loads the input and stores the
	 * output relative to the end of the buffers rather than relative to
	 * the start.  This causes unusual behaviour when encrypting/decrypting
	 * less than 16 bytes; the end of the data is expected to be at the end
	 * of the temporary buffer rather than the start of the data being at
	 * the start of the temporary buffer.
	 */
	sub		x8, x7, #16
	csel		x7, x7, x8, eq
	add		IN, IN, x7
	add		OUT, OUT, x7
	ld1		{v5.16b}, [IN]
	ld1		{v6.16b}, [OUT]
ST5(	mov		v3.16b, v4.16b	)
	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
	ld1		{v10.16b-v11.16b}, [x9]
	tbl		v3.16b, {v3.16b}, v10.16b
	sshr		v11.16b, v11.16b, #7
	eor		v5.16b, v5.16b, v3.16b
	bif		v5.16b, v6.16b, v11.16b
	st1		{v5.16b}, [OUT]
	b		.Lctrout\xctr

	// Arguments
	.unreq OUT
	.unreq IN
	.unreq KEY
	.unreq ROUNDS_W
	.unreq BYTES_W
	.unreq IV
	.unreq BYTE_CTR_W	// XCTR only
	// Intermediate values
	.unreq CTR_W		// XCTR only
	.unreq CTR		// XCTR only
	.unreq IV_PART
	.unreq BLOCKS
	.unreq BLOCKS_W
.endm

	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int bytes, u8 ctr[])
	 *
	 * The input and output buffers must always be at least 16 bytes even
	 * if encrypting/decrypting less than 16 bytes.  Otherwise out of
	 * bounds accesses will occur.  The data to be encrypted/decrypted is
	 * expected to be at the end of this 16-byte temporary buffer rather
	 * than the start.
	 */
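
	/*
	 * On return, ctr[] holds the counter value that follows the last
	 * block processed, so the caller can chain successive calls over a
	 * long request.
	 */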
AES_FUNC_START(aes_ctr_encrypt)
	ctr_encrypt 0
AES_FUNC_END(aes_ctr_encrypt)

	/*
	 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		    int bytes, u8 const iv[], int byte_ctr)
	 *
	 * The input and output buffers must always be at least 16 bytes even
	 * if encrypting/decrypting less than 16 bytes.  Otherwise out of
	 * bounds accesses will occur.  The data to be encrypted/decrypted is
	 * expected to be at the end of this 16-byte temporary buffer rather
	 * than the start.
	 */
AES_FUNC_START(aes_xctr_encrypt)
	ctr_encrypt 1
AES_FUNC_END(aes_xctr_encrypt)

	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */

	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
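
	/*
	 * next_tweak multiplies the 128-bit tweak by x in GF(2^128): the add
	 * doubles each 64-bit half, and the sign-bit mask (swapped by ext)
	 * carries the top bit of the low half into the high half and folds
	 * the top bit of the high half back in as the reduction constant
	 * 0x87 (x^128 = x^7 + x^2 + x + 1). xts_load_mask sets up that
	 * {0x1, 0x87} constant in xtsmask.
	 */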
AES_FUNC_START(aes_xts_encrypt)
	frame_push	0

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]
	frame_pop
	ret

.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]		/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
	frame_push	0

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	frame_pop
	ret

.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]		/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
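
	/*
	 * CBC-MAC core used by the cbcmac/cmac/xcbc glue: dg[] is the running
	 * MAC state. A nonzero enc_before encrypts the state once before any
	 * data is absorbed, and enc_after decides whether the state is
	 * encrypted again after the final block (otherwise it is left
	 * unencrypted between calls). The return value is the number of
	 * blocks still unprocessed, which is nonzero only when cond_yield
	 * took the early exit.
	 */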
AES_FUNC_START(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs		w3, w3, #4
	bmi		.Lmac1x
	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b		.Lmacloop4x
.Lmac1x:
	add		w3, w3, #4
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	mov		w0, w3
	ret
AES_FUNC_END(aes_mac_update)