/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */
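
/*
 * GHASH folds each 16-byte block into a running digest and multiplies the
 * result by the hash key H in GF(2^128). As a rough C-level model
 * (illustrative only, not the kernel API):
 *
 *	dg = gf128_mul(dg ^ block, h);		// once per block
 *
 * The multiply is implemented below either with the single 64x64->128
 * PMULL instruction (the p64 variants, ARMv8 Crypto Extensions) or by
 * synthesizing it from 8x8 polynomial multiplies (the p8 variants) on
 * CPUs that lack the 64-bit form.
 */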
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm
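
	/*
	 * The p8 helpers above build a 64x64->128 bit carry-less multiply
	 * out of 8x8 bit polynomial multiplies: A is multiplied by byte-
	 * rotated copies of B (B1..B4) and vice versa (A1..A3), and the
	 * partial products D..K are masked (k00_16/k32_48), shifted into
	 * byte position with ext, and summed with eor to form the full
	 * 128-bit product.
	 */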
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm
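
	/*
	 * MASK is set up by the callers (movi #0xe1 / shl #57) to hold
	 * 0xc200000000000000 in each 64-bit lane. That constant encodes the
	 * low terms of the GHASH polynomial x^128 + x^7 + x^2 + x + 1 in
	 * the bit-reflected convention, so the two PMULLs by MASK perform
	 * the two folding steps that reduce the 256-bit carry-less product
	 * (spread across XH, XM and XL) back to 128 bits.
	 */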
	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm
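
	/*
	 * Without a 64-bit PMULL, multiplying by the reduction constant is
	 * replaced with plain shifts: the left shifts by 63, 62 and 57 (and
	 * the mirrored right-shift sequence that follows) stand in for the
	 * x, x^2 and x^7 terms of the polynomial, giving the same folding
	 * as the p64 variant at the cost of a longer dependency chain.
	 */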
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]
	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)
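
	/*
	 * A minimal sketch of the expected C-side call pattern (the real
	 * call site lives in the ghash-ce glue code; kernel_neon_begin()/
	 * kernel_neon_end() must bracket any use of the NEON registers in
	 * kernel context):
	 *
	 *	kernel_neon_begin();
	 *	pmull_ghash_update_p64(blocks, dg, src, key, head);
	 *	kernel_neon_end();
	 *
	 * pmull_ghash_update_p8 is the drop-in alternative for CPUs without
	 * the 64x64->128 PMULL instruction.
	 */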
	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm
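
	/*
	 * Round key layout: K0-K5 always hold the first six round keys,
	 * KK/KL/KM always hold the final three (located via 'rounds'), and
	 * K6-K9 are (re)loaded on demand, so that AES-128/192/256 can share
	 * the same enc_block/enc_qround sequences below.
	 */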
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK

	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8	)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *              INP0     INP1     INP2     INP3
	 *  1 byte   |        |        |        |x       |
	 * 16 bytes  |        |        |        |xxxxxxxx|
	 * 17 bytes  |        |        |xxxxxxxx|x       |
	 * 47 bytes  |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 *   etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous
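
	/*
	 * In the simplest (< 16 bytes) case the tail handling above amounts
	 * to (illustrative C only, 'rem' being the trailing byte count):
	 *
	 *	load16(INP3, src_end - 16);		// may read before src
	 *	INP3 = tbl(INP3, .Lpermute_table + 32 - rem);
	 *
	 * i.e. an overlapping 16-byte load followed by a table lookup that
	 * moves the rem valid bytes to the front of INP3 and zero-pads the
	 * rest (tbl yields zero for the 0xff indices). The x14-x16 strides
	 * collapse to zero for input registers that carry no data.
	 */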
2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass

	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only

	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8	)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)
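
	/*
	 * Like the GHASH-only entry points, these are meant to be called
	 * from C glue code with the NEON register file claimed via
	 * kernel_neon_begin()/kernel_neon_end(). The decrypt variant
	 * additionally compares the computed tag with the supplied one and
	 * leaves the verdict in w0 (0 for pass, -1 for fail), as set up at
	 * the end of pmull_gcm_do_crypt.
	 */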
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)
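
	/*
	 * pmull_gcm_ghash_4x processes four blocks per call: the oldest
	 * block (already combined with the running digest in XL) is
	 * multiplied by H^4 (HH4), the following blocks by H^3 (HH3),
	 * H^2 (HH) and H (SHASH), and the partial products are summed so
	 * that only one reduction is needed for all four. The w9 tests at
	 * the top divert 1/2/3-block tails into the same chain at
	 * .Lgh1/.Lgh2/.Lgh3.
	 */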
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)
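
	/*
	 * pmull_gcm_enc_4x generates the CTR keystream for four consecutive
	 * counter values (the low counter word in w8 has already been
	 * advanced by the caller, hence the w8 - 4 .. w8 - 1 adjustments)
	 * and XORs it into INP0-INP3, i.e. roughly (illustrative C only):
	 *
	 *	for (i = 0; i < 4; i++)
	 *		INP[i] ^= AES_encrypt(rk, ctr_block(counter + i));
	 */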
	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous