aes-ce-core.S 11 KB


  1. /*
  2. * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
  3. *
  4. * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License version 2 as
  8. * published by the Free Software Foundation.
  9. */
  10. #include <linux/linkage.h>
  11. #include <asm/assembler.h>
  .text
  .fpu    crypto-neon-fp-armv8            @ accept NEON and Crypto Extension opcodes
  .p2align 3                              @ start on an 8-byte boundary
  /*
   * enc_round - one full AES encryption round on a single state register.
   *   state : q register holding the block, updated in place
   *   key   : q register holding the round key
   * AESE folds in the round key and applies SubBytes/ShiftRows, AESMC then
   * applies MixColumns.
   */
  .macro  enc_round, state, key
          aese.8          \state, \key
          aesmc.8         \state, \state
  .endm
  /*
   * dec_round - one full AES decryption round on a single state register.
   *   state : q register holding the block, updated in place
   *   key   : q register holding the round key
   * AESD folds in the round key and applies the inverse SubBytes/ShiftRows,
   * AESIMC then applies the inverse MixColumns.
   */
  .macro  dec_round, state, key
          aesd.8          \state, \key
          aesimc.8        \state, \state
  .endm
  /*
   * enc_dround - two consecutive encryption rounds on the block in q0,
   * using key1 for the first round and key2 for the second.
   */
  .macro  enc_dround, key1, key2
          .irp            rk, \key1, \key2
          enc_round       q0, \rk
          .endr
  .endm
  /*
   * dec_dround - two consecutive decryption rounds on the block in q0,
   * using key1 for the first round and key2 for the second.
   */
  .macro  dec_dround, key1, key2
          .irp            rk, \key1, \key2
          dec_round       q0, \rk
          .endr
  .endm
  /*
   * enc_fround - the last two encryption rounds on the block in q0.
   *   key1 : key for the last full round
   *   key2 : key for the final round, which has no MixColumns step
   *   key3 : last round key, applied with a plain XOR
   */
  .macro  enc_fround, key1, key2, key3
          aese.8          q0, \key1
          aesmc.8         q0, q0
          aese.8          q0, \key2               @ final round: no aesmc
          veor            q0, q0, \key3           @ closing AddRoundKey
  .endm
  /*
   * dec_fround - the last two decryption rounds on the block in q0.
   *   key1 : key for the last full round
   *   key2 : key for the final round, which has no inverse MixColumns step
   *   key3 : last round key, applied with a plain XOR
   */
  .macro  dec_fround, key1, key2, key3
          aesd.8          q0, \key1
          aesimc.8        q0, q0
          aesd.8          q0, \key2               @ final round: no aesimc
          veor            q0, q0, \key3           @ closing AddRoundKey
  .endm
  /*
   * enc_dround_3x - two encryption rounds on the three blocks in q0-q2.
   * Each round is issued for all three blocks before the next one starts,
   * so the three independent dependency chains are interleaved.
   */
  .macro  enc_dround_3x, key1, key2
          .irp            blk, q0, q1, q2
          enc_round       \blk, \key1
          .endr
          .irp            blk, q0, q1, q2
          enc_round       \blk, \key2
          .endr
  .endm
  /*
   * dec_dround_3x - two decryption rounds on the three blocks in q0-q2.
   * Each round is issued for all three blocks before the next one starts,
   * so the three independent dependency chains are interleaved.
   */
  .macro  dec_dround_3x, key1, key2
          .irp            blk, q0, q1, q2
          dec_round       \blk, \key1
          .endr
          .irp            blk, q0, q1, q2
          dec_round       \blk, \key2
          .endr
  .endm
  /*
   * enc_fround_3x - the last two encryption rounds on the blocks in q0-q2.
   *   key1 : key for the last full round
   *   key2 : key for the final round, which has no MixColumns step
   *   key3 : last round key, applied with a plain XOR
   */
  .macro  enc_fround_3x, key1, key2, key3
          .irp            blk, q0, q1, q2
          enc_round       \blk, \key1
          .endr
          .irp            blk, q0, q1, q2
          aese.8          \blk, \key2
          .endr
          .irp            blk, q0, q1, q2
          veor            \blk, \blk, \key3
          .endr
  .endm
  /*
   * dec_fround_3x - the last two decryption rounds on the blocks in q0-q2.
   *   key1 : key for the last full round
   *   key2 : key for the final round, which has no inverse MixColumns step
   *   key3 : last round key, applied with a plain XOR
   */
  .macro  dec_fround_3x, key1, key2, key3
          .irp            blk, q0, q1, q2
          dec_round       \blk, \key1
          .endr
          .irp            blk, q0, q1, q2
          aesd.8          \blk, \key2
          .endr
          .irp            blk, q0, q1, q2
          veor            \blk, \blk, \key3
          .endr
  .endm
  /*
   * do_block - body shared by the core transform routines.
   *   dround : macro performing two rounds with the two keys it is given
   *   fround : macro performing the closing rounds with three keys
   *
   * On entry q8/q9 hold the first two round keys, q14 the last one, ip
   * points at the third round key and r3 holds the number of rounds.
   * Further round keys are streamed through q10-q13 while the previously
   * loaded pair is being consumed. The flags set by the initial cmp stay
   * live until the two conditional branches, which select the key size.
   * Returns with bx lr.
   */
  .macro  do_block, dround, fround
          cmp             r3, #12                 @ lo: AES-128, eq: AES-192, hi: AES-256
          vld1.8          {q10-q11}, [ip]!
          \dround         q8, q9
          vld1.8          {q12-q13}, [ip]!
          \dround         q10, q11
          vld1.8          {q10-q11}, [ip]!
          \dround         q12, q13
          vld1.8          {q12-q13}, [ip]!
          \dround         q10, q11
          blo             8f                      @ AES-128: 10 rounds
          vld1.8          {q10-q11}, [ip]!
          \dround         q12, q13
          beq             9f                      @ AES-192: 12 rounds
          vld1.8          {q12-q13}, [ip]         @ AES-256: 14 rounds
          \dround         q10, q11
  8:      \fround         q12, q13, q14
          bx              lr
  9:      \fround         q10, q11, q14
          bx              lr
  .endm
  /*
   * Internal, non-AAPCS compliant functions that implement the core AES
   * transforms. These preserve all registers except q0 - q2, ip, the
   * condition flags, and q10 - q13, which are used as scratch space for
   * the round keys loaded from the round key array.
   * Arguments:
   * q0 : first in/output block
   * q1 : second in/output block (_3x version only)
   * q2 : third in/output block (_3x version only)
   * q8 : first round key
   * q9 : second round key
   * q14 : final round key
   * r2 : address of round key array
   * r3 : number of rounds
   */
  /*
   * aes_encrypt - encrypt the single block held in q0, in place.
   * The secondary entry point .Laes_encrypt_tweak skips the ip setup so
   * that a caller can supply its own pointer to the third round key.
   */
  .p2align 6
  aes_encrypt:
          add             ip, r2, #32             @ ip = address of the 3rd round key
  .Laes_encrypt_tweak:
          do_block        enc_dround, enc_fround
  ENDPROC(aes_encrypt)
  /*
   * aes_decrypt - decrypt the single block held in q0, in place.
   */
  .p2align 6
  aes_decrypt:
          add             ip, r2, #32             @ ip = address of the 3rd round key
          do_block        dec_dround, dec_fround
  ENDPROC(aes_decrypt)
  /*
   * aes_encrypt_3x - encrypt the three blocks held in q0-q2, in place.
   */
  .p2align 6
  aes_encrypt_3x:
          add             ip, r2, #32             @ ip = address of the 3rd round key
          do_block        enc_dround_3x, enc_fround_3x
  ENDPROC(aes_encrypt_3x)
  /*
   * aes_decrypt_3x - decrypt the three blocks held in q0-q2, in place.
   */
  .p2align 6
  aes_decrypt_3x:
          add             ip, r2, #32             @ ip = address of the 3rd round key
          do_block        dec_dround_3x, dec_fround_3x
  ENDPROC(aes_decrypt_3x)
  /*
   * prepare_key - preload the round keys that stay resident in registers.
   *   rk     : register holding the address of the round key array
   *   rounds : register holding the number of rounds
   * Loads the first two round keys into q8/q9 and the key at offset
   * 16 * rounds, i.e. the last one, into q14. Clobbers ip.
   */
  .macro  prepare_key, rk, rounds
          add             ip, \rk, \rounds, lsl #4
          vld1.8          {q8-q9}, [\rk]
          vld1.8          {q14}, [ip]
  .endm
  /*
   * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
   *                    int blocks)
   * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
   *                    int blocks)
   */
  /*
   * ECB encryption: r0 = out, r1 = in, r2 = round keys, r3 = rounds,
   * first stack argument = number of blocks. Blocks are processed three
   * at a time while at least three remain, then one at a time.
   */
  ENTRY(ce_aes_ecb_encrypt)
          push            {r4, lr}
          ldr             r4, [sp, #8]            @ r4 = blocks (first stack argument)
          prepare_key     r2, r3
  .Lecb_enc_triple:
          subs            r4, r4, #3
          bmi             .Lecb_enc_rest          @ fewer than three blocks left
          vld1.8          {q0-q1}, [r1]!
          vld1.8          {q2}, [r1]!
          bl              aes_encrypt_3x
          vst1.8          {q0-q1}, [r0]!
          vst1.8          {q2}, [r0]!
          b               .Lecb_enc_triple
  .Lecb_enc_rest:
          adds            r4, r4, #3              @ undo the bias: 0-2 blocks remain
          beq             .Lecb_enc_done
  .Lecb_enc_single:
          vld1.8          {q0}, [r1]!
          bl              aes_encrypt
          vst1.8          {q0}, [r0]!
          subs            r4, r4, #1
          bne             .Lecb_enc_single
  .Lecb_enc_done:
          pop             {r4, pc}
  ENDPROC(ce_aes_ecb_encrypt)
  /*
   * ECB decryption: r0 = out, r1 = in, r2 = round keys, r3 = rounds,
   * first stack argument = number of blocks. Blocks are processed three
   * at a time while at least three remain, then one at a time.
   */
  ENTRY(ce_aes_ecb_decrypt)
          push            {r4, lr}
          ldr             r4, [sp, #8]            @ r4 = blocks (first stack argument)
          prepare_key     r2, r3
  .Lecb_dec_triple:
          subs            r4, r4, #3
          bmi             .Lecb_dec_rest          @ fewer than three blocks left
          vld1.8          {q0-q1}, [r1]!
          vld1.8          {q2}, [r1]!
          bl              aes_decrypt_3x
          vst1.8          {q0-q1}, [r0]!
          vst1.8          {q2}, [r0]!
          b               .Lecb_dec_triple
  .Lecb_dec_rest:
          adds            r4, r4, #3              @ undo the bias: 0-2 blocks remain
          beq             .Lecb_dec_done
  .Lecb_dec_single:
          vld1.8          {q0}, [r1]!
          bl              aes_decrypt
          vst1.8          {q0}, [r0]!
          subs            r4, r4, #1
          bne             .Lecb_dec_single
  .Lecb_dec_done:
          pop             {r4, pc}
  ENDPROC(ce_aes_ecb_decrypt)
  /*
   * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
   *                    int blocks, u8 iv[])
   * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
   *                    int blocks, u8 iv[])
   */
  /*
   * CBC encryption: r0 = out, r1 = in, r2 = round keys, r3 = rounds,
   * stack arguments = number of blocks, iv pointer.
   *
   * CBC encryption is inherently serial, so blocks are handled one at a
   * time with the running chaining value kept in q0. The last ciphertext
   * block is written back to iv[] on return.
   *
   * A block count of zero returns immediately and leaves iv[] untouched.
   * Without that check the count-down loop below would wrap around and
   * run far past the end of the buffers.
   */
  ENTRY(ce_aes_cbc_encrypt)
          push            {r4-r6, lr}
          ldrd            r4, r5, [sp, #16]       @ r4 = blocks, r5 = iv
          teq             r4, #0                  @ nothing to do?
          beq             .Lcbcencout
          vld1.8          {q0}, [r5]              @ q0 = chaining value
          prepare_key     r2, r3
  .Lcbcencloop:
          vld1.8          {q1}, [r1]!             @ get next pt block
          veor            q0, q0, q1              @ ..and xor with iv
          bl              aes_encrypt
          vst1.8          {q0}, [r0]!
          subs            r4, r4, #1
          bne             .Lcbcencloop
          vst1.8          {q0}, [r5]              @ return next iv
  .Lcbcencout:
          pop             {r4-r6, pc}
  ENDPROC(ce_aes_cbc_encrypt)
  /*
   * CBC decryption: r0 = out, r1 = in, r2 = round keys, r3 = rounds,
   * stack arguments = number of blocks, iv pointer.
   *
   * q6 always holds the previous ciphertext block (initially the iv).
   * Blocks are processed three at a time while at least three remain,
   * then one at a time. The last ciphertext block is written back to
   * iv[] on return.
   */
  ENTRY(ce_aes_cbc_decrypt)
          push            {r4-r6, lr}
          ldrd            r4, r5, [sp, #16]       @ r4 = blocks, r5 = iv
          vld1.8          {q6}, [r5]              @ q6 = chaining value
          prepare_key     r2, r3
  .Lcbc_dec_triple:
          subs            r4, r4, #3
          bmi             .Lcbc_dec_rest          @ fewer than three blocks left
          vld1.8          {q0-q1}, [r1]!
          vld1.8          {q2}, [r1]!
          vmov            q3, q0                  @ keep copies of the ciphertext,
          vmov            q4, q1                  @ the decryption overwrites q0-q2
          vmov            q5, q2
          bl              aes_decrypt_3x
          veor            q0, q0, q6              @ xor with the preceding ct block
          veor            q1, q1, q3
          veor            q2, q2, q4
          vmov            q6, q5                  @ last ct block chains onwards
          vst1.8          {q0-q1}, [r0]!
          vst1.8          {q2}, [r0]!
          b               .Lcbc_dec_triple
  .Lcbc_dec_rest:
          adds            r4, r4, #3              @ undo the bias: 0-2 blocks remain
          beq             .Lcbc_dec_done
          vmov            q15, q14                @ preserve last round key
  .Lcbc_dec_single:
          vld1.8          {q0}, [r1]!             @ get next ct block
          veor            q14, q15, q6            @ fold prev ct into the last key so the
                                                  @ final xor of the decryption also chains
          vmov            q6, q0
          bl              aes_decrypt
          vst1.8          {q0}, [r0]!
          subs            r4, r4, #1
          bne             .Lcbc_dec_single
  .Lcbc_dec_done:
          vst1.8          {q6}, [r5]              @ return next iv
          pop             {r4-r6, pc}
  ENDPROC(ce_aes_cbc_decrypt)
  /*
   * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
   *                    int blocks, u8 ctr[])
   */
  /*
   * CTR mode: r0 = out, r1 = in, r2 = round keys, r3 = rounds,
   * stack arguments = number of blocks, counter pointer.
   *
   * q6 holds the counter block as it appears in memory; r6 shadows its
   * last 32-bit word (s27) in byte-swapped form so it can be incremented
   * with ordinary integer arithmetic. If adding the block count to r6
   * would carry out of 32 bits, only the one-block-at-a-time path is
   * used, since it is the only one that propagates the carry into the
   * upper counter words. Otherwise blocks go three at a time first.
   *
   * A negative block count requests a tail block: the key stream block
   * is stored to out[] without reading any input and without updating
   * ctr[].
   */
  ENTRY(ce_aes_ctr_encrypt)
          push            {r4-r6, lr}
          ldrd            r4, r5, [sp, #16]       @ r4 = blocks, r5 = ctr
          vld1.8          {q6}, [r5]              @ load ctr
          prepare_key     r2, r3
          vmov            r6, s27                 @ keep swabbed ctr in r6
          rev             r6, r6
          cmn             r6, r4                  @ 32 bit overflow?
          bcs             .Lctr_single
  .Lctr_triple:
          subs            r4, r4, #3
          bmi             .Lctr_rest              @ fewer than three blocks left
          add             r6, r6, #1
          vmov            q0, q6                  @ q0 = ctr
          vmov            q1, q6
          rev             ip, r6
          add             r6, r6, #1
          vmov            q2, q6
          vmov            s7, ip                  @ q1 = ctr + 1 (s7 is the top word of q1)
          rev             ip, r6
          add             r6, r6, #1
          vmov            s11, ip                 @ q2 = ctr + 2 (s11 is the top word of q2)
          vld1.8          {q3-q4}, [r1]!
          vld1.8          {q5}, [r1]!
          bl              aes_encrypt_3x
          veor            q0, q0, q3
          veor            q1, q1, q4
          veor            q2, q2, q5
          rev             ip, r6
          vst1.8          {q0-q1}, [r0]!
          vst1.8          {q2}, [r0]!
          vmov            s27, ip                 @ q6 = ctr + 3
          b               .Lctr_triple
  .Lctr_rest:
          adds            r4, r4, #3              @ undo the bias
          beq             .Lctr_done
  .Lctr_single:
          vmov            q0, q6
          bl              aes_encrypt
          subs            r4, r4, #1
          bmi             .Lctr_tail              @ blocks < 0 means tail block
          vld1.8          {q3}, [r1]!
          veor            q3, q0, q3
          vst1.8          {q3}, [r0]!
          adds            r6, r6, #1              @ increment BE ctr
          rev             ip, r6                  @ rev and vmov leave the carry
          vmov            s27, ip                 @ flag from adds untouched
          bcs             .Lctr_carry
          teq             r4, #0
          bne             .Lctr_single
  .Lctr_done:
          vst1.8          {q6}, [r5]              @ return the updated counter
          pop             {r4-r6, pc}
  .Lctr_tail:
          @ NOTE(review): the :64 hint requires out[] to be 8-byte aligned on
          @ this path - confirm that every caller guarantees it.
          vst1.8          {q0}, [r0, :64]         @ return just the key stream
          pop             {r4-r6, pc}
  .Lctr_carry:
          .irp            sreg, s26, s25, s24
          vmov            ip, \sreg               @ load next word of ctr
          rev             ip, ip                  @ ... to handle the carry
          adds            ip, ip, #1
          rev             ip, ip
          vmov            \sreg, ip
          bcc             .Lctr_carry_done
          .endr
  .Lctr_carry_done:
          teq             r4, #0
          beq             .Lctr_done
          b               .Lctr_single
  ENDPROC(ce_aes_ctr_encrypt)
  /*
   * ce_aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
   *                    int blocks, u8 iv[], u8 const rk2[], int first)
   * ce_aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
   *                    int blocks, u8 iv[], u8 const rk2[], int first)
   */
  /*
   * next_tweak - derive the next XTS tweak by doubling the current one.
   *   out   : q register receiving the new tweak (may be the same as in)
   *   in    : q register holding the current tweak
   *   const : q register holding the two 64-bit constants 1 and 0x87
   *   tmp   : scratch q register
   * Both 64-bit halves are shifted left by one bit. The top bit of each
   * half selects its constant, the selected values swap halves and are
   * XORed in: the low half carries into the high half, and a bit shifted
   * out of the high half is reduced back into the low half.
   */
  .macro  next_tweak, out, in, const, tmp
          vshr.s64        \tmp, \in, #63          @ all-ones where the top bit is set
          vand            \tmp, \tmp, \const
          vadd.u64        \out, \in, \in          @ shift each half left by one
          vext.8          \tmp, \tmp, \tmp, #8    @ swap the two halves
          veor            \out, \out, \tmp
  .endm
  /*
   * ce_aes_xts_init - prologue shared by the XTS encrypt and decrypt entry
   * points. It must be called right after their push {r4-r6, lr}, because
   * the stack arguments are addressed relative to that frame.
   *
   * On return:
   *   q7 : tweak multiplication constants (d14 = 1, d15 = 0x87)
   *   r4 : number of blocks
   *   r5 : iv pointer
   *   q0 : the iv, encrypted with the second key if first == 1
   *   r6 : the value of first if it was not 1, else the rk2 pointer
   * When first == 1 this also overwrites q8, q9, q14 and ip with state for
   * the second key, so the caller has to reload its own key afterwards.
   */
  .p2align 3
  .Lxts_mul_x:
          .quad           1                       @ d14: carry from low into high half
          .quad           0x87                    @ d15: reduction folded into low half
  ce_aes_xts_init:
          vldr            d14, .Lxts_mul_x
          vldr            d15, .Lxts_mul_x + 8
          ldrd            r4, r5, [sp, #16]       @ r4 = blocks, r5 = iv
          ldr             r6, [sp, #28]           @ r6 = first
          vld1.8          {q0}, [r5]              @ load iv
          teq             r6, #1                  @ start of a block?
          bxne            lr
          @ Encrypt the IV in q0 with the second AES key. This should only
          @ be done at the start of a block.
          ldr             r6, [sp, #24]           @ load AES key 2
          prepare_key     r6, r3
          add             ip, r6, #32             @ 3rd round key of key 2
          b               .Laes_encrypt_tweak     @ tail call
  ENDPROC(ce_aes_xts_init)
  /*
   * XTS encryption: r0 = out, r1 = in, r2 = round keys of key 1,
   * r3 = rounds, stack arguments = blocks, iv, round keys of key 2, first.
   *
   * q3 holds the tweak of the block being processed, q4/q5 the following
   * two in the three-block loop, q7 the tweak constants and q6 is scratch.
   * The tweak used for the last block is written back to iv[], so a
   * follow-up call that does not start a new block advances it first.
   */
  ENTRY(ce_aes_xts_encrypt)
          push            {r4-r6, lr}
          bl              ce_aes_xts_init         @ run shared prologue
          prepare_key     r2, r3
          vmov            q3, q0
          teq             r6, #0                  @ start of a block?
          bne             .Lxts_enc_triple        @ yes: q3 is the first tweak as is
  .Lxts_enc_advance:
          next_tweak      q3, q3, q7, q6
  .Lxts_enc_triple:
          subs            r4, r4, #3
          bmi             .Lxts_enc_rest          @ fewer than three blocks left
          vld1.8          {q0-q1}, [r1]!          @ get 3 pt blocks
          vld1.8          {q2}, [r1]!
          next_tweak      q4, q3, q7, q6
          veor            q0, q0, q3
          next_tweak      q5, q4, q7, q6
          veor            q1, q1, q4
          veor            q2, q2, q5
          bl              aes_encrypt_3x
          veor            q0, q0, q3
          veor            q1, q1, q4
          veor            q2, q2, q5
          vst1.8          {q0-q1}, [r0]!          @ write 3 ct blocks
          vst1.8          {q2}, [r0]!
          vmov            q3, q5                  @ last tweak used so far
          teq             r4, #0
          beq             .Lxts_enc_done
          b               .Lxts_enc_advance
  .Lxts_enc_rest:
          adds            r4, r4, #3              @ undo the bias: 0-2 blocks remain
          beq             .Lxts_enc_done
  .Lxts_enc_single:
          vld1.8          {q0}, [r1]!
          veor            q0, q0, q3
          bl              aes_encrypt
          veor            q0, q0, q3
          vst1.8          {q0}, [r0]!
          subs            r4, r4, #1
          beq             .Lxts_enc_done
          next_tweak      q3, q3, q7, q6
          b               .Lxts_enc_single
  .Lxts_enc_done:
          vst1.8          {q3}, [r5]              @ hand the last tweak back in iv[]
          pop             {r4-r6, pc}
  ENDPROC(ce_aes_xts_encrypt)
  /*
   * XTS decryption: r0 = out, r1 = in, r2 = round keys of key 1,
   * r3 = rounds, stack arguments = blocks, iv, round keys of key 2, first.
   *
   * q3 holds the tweak of the block being processed, q4/q5 the following
   * two in the three-block loop, q7 the tweak constants and q6 is scratch.
   * The tweak used for the last block is written back to iv[], so a
   * follow-up call that does not start a new block advances it first.
   *
   * The single-block loop does not set up ip before calling aes_decrypt:
   * that routine computes the address of the third round key itself.
   */
  ENTRY(ce_aes_xts_decrypt)
          push            {r4-r6, lr}
          bl              ce_aes_xts_init         @ run shared prologue
          prepare_key     r2, r3
          vmov            q3, q0
          teq             r6, #0                  @ start of a block?
          bne             .Lxtsdec3x              @ yes: q3 is the first tweak as is
  .Lxtsdecloop3x:
          next_tweak      q3, q3, q7, q6
  .Lxtsdec3x:
          subs            r4, r4, #3
          bmi             .Lxtsdec1x              @ fewer than three blocks left
          vld1.8          {q0-q1}, [r1]!          @ get 3 ct blocks
          vld1.8          {q2}, [r1]!
          next_tweak      q4, q3, q7, q6
          veor            q0, q0, q3
          next_tweak      q5, q4, q7, q6
          veor            q1, q1, q4
          veor            q2, q2, q5
          bl              aes_decrypt_3x
          veor            q0, q0, q3
          veor            q1, q1, q4
          veor            q2, q2, q5
          vst1.8          {q0-q1}, [r0]!          @ write 3 pt blocks
          vst1.8          {q2}, [r0]!
          vmov            q3, q5                  @ last tweak used so far
          teq             r4, #0
          beq             .Lxtsdecout
          b               .Lxtsdecloop3x
  .Lxtsdec1x:
          adds            r4, r4, #3              @ undo the bias: 0-2 blocks remain
          beq             .Lxtsdecout
  .Lxtsdecloop:
          vld1.8          {q0}, [r1]!
          veor            q0, q0, q3
          bl              aes_decrypt
          veor            q0, q0, q3
          vst1.8          {q0}, [r0]!
          subs            r4, r4, #1
          beq             .Lxtsdecout
          next_tweak      q3, q3, q7, q6
          b               .Lxtsdecloop
  .Lxtsdecout:
          vst1.8          {q3}, [r5]              @ hand the last tweak back in iv[]
          pop             {r4-r6, pc}
  ENDPROC(ce_aes_xts_decrypt)
  450. /*
  451. * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
  452. * AES sbox substitution on each byte in
  453. * 'input'
  454. */
  /*
   * r0 = input word; returns the substituted word in r0.
   * The input is replicated into all four columns and fed to AESE as the
   * key against an all-zero state. With identical columns the ShiftRows
   * step of AESE changes nothing, so the first word of the result is the
   * S-box substitution of the input.
   */
  ENTRY(ce_aes_sub)
          veor            q0, q0, q0              @ all-zero state
          vdup.32         q1, r0                  @ input word in every column
          aese.8          q0, q1
          vmov            r0, s0
          bx              lr
  ENDPROC(ce_aes_sub)
  462. /*
  463. * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
  464. * operation on round key *src
  465. */
  /*
   * r0 = dst, r1 = src. Loads the 16-byte round key at src, applies the
   * Inverse MixColumns transformation and stores the result at dst.
   */
  ENTRY(ce_aes_invert)
          vld1.8          {q0}, [r1]              @ fetch the round key
          aesimc.8        q0, q0
          vst1.8          {q0}, [r0]              @ store the converted key
          bx              lr
  ENDPROC(ce_aes_invert)
  471. ENDPROC(ce_aes_invert)