/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_ENTRY(func)		ENTRY(neon_ ## func)
#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)

	/* multiply by polynomial 'x' in GF(2^8) */
	.macro	mul_by_x, out, in, temp, const
	sshr	\temp, \in, #7
	shl	\out, \in, #1
	and	\temp, \temp, \const
	eor	\out, \out, \temp
	.endm
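
/*
 * Doubling a byte in GF(2^8) modulo the AES polynomial
 * x^8 + x^4 + x^3 + x + 1 is a left shift by one, plus an XOR with
 * 0x1b whenever the shifted-out bit was set.  The sshr #7 replicates
 * each byte's top bit into an all-ones/all-zeroes mask that gates the
 * 0x1b constant in \const.  A rough per-byte C model:
 *
 *	out = (u8)(in << 1) ^ ((in & 0x80) ? 0x1b : 0x00);
 */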

	/* multiply by polynomial 'x^2' in GF(2^8) */
	.macro	mul_by_x2, out, in, temp, const
	ushr	\temp, \in, #6
	shl	\out, \in, #2
	pmul	\temp, \temp, \const
	eor	\out, \out, \temp
	.endm
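
/*
 * Multiplying by x^2 shifts two bits out of each byte.  Rather than
 * reducing twice, the two lost bits (in >> 6) are multiplied by 0x1b
 * with the polynomial multiply 'pmul', producing the combined
 * reduction term in one step (bit 7 contributes x * 0x1b = 0x36,
 * bit 6 contributes 0x1b).  Per byte, roughly:
 *
 *	out = (u8)(in << 2) ^ pmul8(in >> 6, 0x1b);
 *
 * where pmul8() is a hypothetical 8-bit carryless multiply, shown
 * for illustration only.
 */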

	/* preload the entire Sbox */
	.macro	prepare, sbox, shiftrows, temp
	movi	v12.16b, #0x1b
	ldr_l	q13, \shiftrows, \temp
	ldr_l	q14, .Lror32by8, \temp
	adr_l	\temp, \sbox
	ld1	{v16.16b-v19.16b}, [\temp], #64
	ld1	{v20.16b-v23.16b}, [\temp], #64
	ld1	{v24.16b-v27.16b}, [\temp], #64
	ld1	{v28.16b-v31.16b}, [\temp]
	.endm
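
/*
 * Fixed register allocation used by all code in this file: v12 holds
 * the GF(2^8) reduction constant 0x1b, v13 the ShiftRows permutation,
 * v14 the rotate-by-8 permutation and v16-v31 the 256 byte Sbox,
 * leaving v0-v11 and v15 for AES states and scratch.
 */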

	/* do preload for encryption */
	.macro	enc_prepare, ignore0, ignore1, temp
	prepare	.LForward_Sbox, .LForward_ShiftRows, \temp
	.endm

	.macro	enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

	/* do preload for decryption */
	.macro	dec_prepare, ignore0, ignore1, temp
	prepare	.LReverse_Sbox, .LReverse_ShiftRows, \temp
	.endm

	/* apply SubBytes transformation using the preloaded Sbox */
	.macro	sub_bytes, in
	sub	v9.16b, \in\().16b, v15.16b
	tbl	\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub	v10.16b, v9.16b, v15.16b
	tbx	\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub	v11.16b, v10.16b, v15.16b
	tbx	\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx	\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
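
/*
 * tbl/tbx can index at most 64 table bytes per instruction, so the
 * 256 byte Sbox is looked up in four chunks: 'tbl' zeroes lanes whose
 * index is out of range, and each subsequent 'tbx' fills in only the
 * lanes whose rebased index hits its 64 byte window, leaving the rest
 * untouched.  v15 holds the constant 0x40 (set up by the do_block
 * macros below), and the 'sub' instructions rebase the indices for
 * the second, third and fourth chunk.
 */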

	/* apply MixColumns transformation */
	.macro	mix_columns, in, enc
	.if	\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
	eor	\in\().16b, \in\().16b, v8.16b
	rev32	v8.8h, v8.8h
	eor	\in\().16b, \in\().16b, v8.16b
	.endif
	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
	rev32	v8.8h, \in\().8h
	eor	v8.16b, v8.16b, v9.16b
	eor	\in\().16b, \in\().16b, v8.16b
	tbl	\in\().16b, {\in\().16b}, v14.16b
	eor	\in\().16b, \in\().16b, v8.16b
	.endm
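
/*
 * Per 32-bit column w, the sequence above computes a standard
 * decomposition of MixColumns (rot16() is the rev32 on .8h lanes,
 * rot8() the tbl with the .Lror32by8 vector; both rotate the bytes
 * within each word):
 *
 *	t   = (x * w) ^ rot16(w)
 *	out = rot8(w ^ t) ^ t
 *
 * which expands per byte to the familiar 2*a0 ^ 3*a1 ^ a2 ^ a3
 * pattern.  For decryption, pre-multiplying each column by the
 * circulant { 5, 0, 4, 0 } first is what turns MixColumns into
 * Inverse MixColumns.
 */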

	.macro	do_block, enc, in, rounds, rk, rkp, i
	ld1	{v15.4s}, [\rk]
	add	\rkp, \rk, #16
	mov	\i, \rounds
1111:	eor	\in\().16b, \in\().16b, v15.16b		/* ^round key */
	movi	v15.16b, #0x40
	tbl	\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	subs	\i, \i, #1
	ld1	{v15.4s}, [\rkp], #16
	beq	2222f
	mix_columns	\in, \enc
	b	1111b
2222:	eor	\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm
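
/*
 * Rough C model of the round loop above; shift_rows(), sub_bytes()
 * and mix_columns() are illustrative stand-ins for the macros of the
 * same name, not real functions:
 *
 *	st ^= rk[0];
 *	for (i = 0; i < rounds; i++) {
 *		st = sub_bytes(shift_rows(st)); // bytewise SubBytes commutes with ShiftRows
 *		if (i < rounds - 1)
 *			st = mix_columns(st);	// skipped in the final round
 *		st ^= rk[i + 1];
 *	}
 *
 * Note that v15 does double duty: it carries the round key into each
 * iteration and is then reloaded with the 0x40 constant that
 * sub_bytes expects.
 */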

	.macro	encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	.macro	decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm

/*
 * Interleaved versions: functionally equivalent to the
 * ones above, but applied to 2 or 4 AES states in parallel.
 */
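
/*
 * The tbl/tbx chains in sub_bytes form a long serial dependency, so a
 * single state leaves the NEON pipeline largely idle.  Interleaving
 * two or four independent states lets the core issue the lookups for
 * one state while those of another are still in flight.
 */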

	.macro	sub_bytes_2x, in0, in1
	sub	v8.16b, \in0\().16b, v15.16b
	tbl	\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub	v9.16b, \in1\().16b, v15.16b
	tbl	\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub	v10.16b, v8.16b, v15.16b
	tbx	\in0\().16b, {v20.16b-v23.16b}, v8.16b
	sub	v11.16b, v9.16b, v15.16b
	tbx	\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub	v8.16b, v10.16b, v15.16b
	tbx	\in0\().16b, {v24.16b-v27.16b}, v10.16b
	sub	v9.16b, v11.16b, v15.16b
	tbx	\in1\().16b, {v24.16b-v27.16b}, v11.16b
	tbx	\in0\().16b, {v28.16b-v31.16b}, v8.16b
	tbx	\in1\().16b, {v28.16b-v31.16b}, v9.16b
	.endm

	.macro	sub_bytes_4x, in0, in1, in2, in3
	sub	v8.16b, \in0\().16b, v15.16b
	tbl	\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub	v9.16b, \in1\().16b, v15.16b
	tbl	\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub	v10.16b, \in2\().16b, v15.16b
	tbl	\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub	v11.16b, \in3\().16b, v15.16b
	tbl	\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx	\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx	\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub	v8.16b, v8.16b, v15.16b
	tbx	\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub	v9.16b, v9.16b, v15.16b
	tbx	\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub	v10.16b, v10.16b, v15.16b
	tbx	\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub	v11.16b, v11.16b, v15.16b
	tbx	\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub	v8.16b, v8.16b, v15.16b
	tbx	\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub	v9.16b, v9.16b, v15.16b
	tbx	\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub	v10.16b, v10.16b, v15.16b
	tbx	\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub	v11.16b, v11.16b, v15.16b
	tbx	\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx	\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx	\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
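
/*
 * With four states in flight only v8-v11 are left as index
 * temporaries, so instead of keeping a separately rebased copy per
 * Sbox chunk (as sub_bytes_2x does with v10/v11), each index register
 * is rebased in place by the interleaved 'sub' instructions once the
 * preceding tbx has consumed it.
 */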

	.macro	mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr	\tmp0\().16b, \in0\().16b, #7
	shl	\out0\().16b, \in0\().16b, #1
	sshr	\tmp1\().16b, \in1\().16b, #7
	and	\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl	\out1\().16b, \in1\().16b, #1
	and	\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor	\out0\().16b, \out0\().16b, \tmp0\().16b
	eor	\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro	mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
	ushr	\tmp0\().16b, \in0\().16b, #6
	shl	\out0\().16b, \in0\().16b, #2
	ushr	\tmp1\().16b, \in1\().16b, #6
	pmul	\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl	\out1\().16b, \in1\().16b, #2
	pmul	\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor	\out0\().16b, \out0\().16b, \tmp0\().16b
	eor	\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro	mix_columns_2x, in0, in1, enc
	.if	\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
	eor	\in0\().16b, \in0\().16b, v8.16b
	rev32	v8.8h, v8.8h
	eor	\in1\().16b, \in1\().16b, v9.16b
	rev32	v9.8h, v9.8h
	eor	\in0\().16b, \in0\().16b, v8.16b
	eor	\in1\().16b, \in1\().16b, v9.16b
	.endif
	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
	rev32	v10.8h, \in0\().8h
	rev32	v11.8h, \in1\().8h
	eor	v10.16b, v10.16b, v8.16b
	eor	v11.16b, v11.16b, v9.16b
	eor	\in0\().16b, \in0\().16b, v10.16b
	eor	\in1\().16b, \in1\().16b, v11.16b
	tbl	\in0\().16b, {\in0\().16b}, v14.16b
	tbl	\in1\().16b, {\in1\().16b}, v14.16b
	eor	\in0\().16b, \in0\().16b, v10.16b
	eor	\in1\().16b, \in1\().16b, v11.16b
	.endm

	.macro	do_block_2x, enc, in0, in1, rounds, rk, rkp, i
	ld1	{v15.4s}, [\rk]
	add	\rkp, \rk, #16
	mov	\i, \rounds
1111:	eor	\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor	\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	movi	v15.16b, #0x40
	tbl	\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl	\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_2x	\in0, \in1
	subs	\i, \i, #1
	ld1	{v15.4s}, [\rkp], #16
	beq	2222f
	mix_columns_2x	\in0, \in1, \enc
	b	1111b
2222:	eor	\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor	\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	.endm

	.macro	do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1	{v15.4s}, [\rk]
	add	\rkp, \rk, #16
	mov	\i, \rounds
1111:	eor	\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor	\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor	\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor	\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	movi	v15.16b, #0x40
	tbl	\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl	\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl	\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl	\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	subs	\i, \i, #1
	ld1	{v15.4s}, [\rkp], #16
	beq	2222f
	mix_columns_2x	\in0, \in1, \enc
	mix_columns_2x	\in2, \in3, \enc
	b	1111b
2222:	eor	\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor	\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor	\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor	\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm
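
/*
 * The mix step is done as two 2-way calls here: mix_columns_2x needs
 * v8-v11 as scratch for a pair of states, so (presumably) a true
 * 4-way version would not fit in the remaining registers.
 */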

	.macro	encrypt_block2x, in0, in1, rounds, rk, rkp, i
	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
	.endm

	.macro	decrypt_block2x, in0, in1, rounds, rk, rkp, i
	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
	.endm

	.macro	encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	.macro	decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm
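
/*
 * aes-modes.S builds the actual mode routines (ECB, CBC, CTR, XTS) on
 * top of the block macros above; the AES_ENTRY/AES_ENDPROC defines at
 * the top of this file give them their 'neon_' symbol prefix.
 */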

#include "aes-modes.S"

	.section	".rodata", "a"
	.align		6

.LForward_Sbox:
	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16

.LReverse_Sbox:
	.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
	.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
	.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
	.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
	.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
	.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
	.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
	.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
	.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
	.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
	.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
	.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
	.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
	.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
	.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
	.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
	.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
	.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
	.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
	.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
	.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
	.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
	.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
	.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
	.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
	.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
	.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
	.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
	.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
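
/*
 * The .octa constants below are tbl index vectors, read as 16
 * little-endian bytes.  .LForward_ShiftRows decodes to 00 05 0a 0f
 * 04 09 0e 03 ..., i.e. ShiftRows on the column-major AES state;
 * .LReverse_ShiftRows is its inverse; .Lror32by8 decodes to 01 02 03
 * 00 05 06 07 04 ..., rotating the bytes within each 32-bit word.
 */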

.LForward_ShiftRows:
	.octa	0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
	.octa	0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
	.octa	0x0c0f0e0d080b0a090407060500030201