/* ghash-ce-core.S */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	@ GHASH state and key registers. *_L/*_H are the low/high d-register
	@ halves of the corresponding q register.
	SHASH		.req	q0	@ hash key H
	T1		.req	q1	@ scratch / current input block
	XL		.req	q2	@ GHASH accumulator / low product
	XM		.req	q3	@ middle (Karatsuba) product
	XH		.req	q4	@ high product
	IN1		.req	q4	@ overlaps XH: byte-rotated input block
	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	@ Temporaries for the vmull.p8 fallback multiplication (__pmull_p8)
	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	XH2		.req	q9	@ overlaps t4q (p64 path only)

	@ Precomputed byte rotations of the key, used only by the p8 path
	@ (set up in pmull_ghash_update_p8).
	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28	@ reduction polynomial 0xe1 << 57 (p64 path)
	SHASH2_p8	.req	d28	@ overlaps MASK: H_L ^ H_H for the p8 path

	@ Bit masks for recombining partial products in __pmull_p8
	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31	@ overlaps k48: H_L ^ H_H for the p64 path

	@ Higher powers of the hash key (H^2..H^4), used by the 4-way
	@ aggregated p64 path. These overlap the sN rotations above, which
	@ the p8 path uses instead - the two paths never run together.
	HH		.req	q10	@ H^2
	HH3		.req	q11	@ H^3
	HH4		.req	q12	@ H^4
	HH34		.req	q13	@ folded (L^H) halves of H^3 and H^4
	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29	@ HH_L ^ HH_H (overlaps k16)

	@ Second block/product set for the aggregated path; overlaps the
	@ p8 temporaries (again, mutually exclusive code paths).
	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
	/*
	 * __pmull_p64 - 64x64 -> 128 bit carryless multiply using the
	 * single vmull.p64 instruction (ARMv8 Crypto Extensions).
	 *
	 * b1..b4 exist only so this macro has the same signature as
	 * __pmull_p8 (which needs precomputed operand rotations); they
	 * are ignored here.
	 */
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 *
	 * rq     = 128-bit result, ad/bd = 64-bit polynomial operands.
	 * b1..b4 = byte rotations of bd by 1..4; if the caller passes
	 *          precomputed rotations (s1l..s4h) the vext.8 setup below
	 *          is skipped at macro-expansion time via .ifc.
	 * Clobbers t0q-t4q; relies on k16/k32/k48 masks being set up.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48		@ mask off the bits that wrapped
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15	@ shift partial products into place
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q		@ accumulate everything into D
	veor		\rq, \rq, t2q
	.endm
	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	// Folds the 256-bit product in XL:XM:XH down to 128 bits modulo
	// the GHASH polynomial. Expects MASK = 0xe1 << 57. On exit the
	// caller completes the fold with two more veor's (see ghash_update).
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H	@ fold middle product into hi/lo
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK		@ second reduction step
	.endm
	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	// Same mathematical reduction as __pmull_reduce_p64, but the
	// multiplications by the sparse GHASH polynomial are expanded into
	// shifts and xors (x^63 + x^62 + x^57 on the way down, and the
	// inverse shifts on the way back up).
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L	@ fold middle product into hi/lo
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57		@ multiply by the polynomial:
	vshl.i64	T2, XL, #62		@ T1 = XL * (x^57 + x^62 + x^63)
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1		@ second step, via right shifts
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
	/*
	 * ghash_update - core GHASH (and optional GCM en/decrypt) loop.
	 *
	 * pn        : p64 or p8 - selects the multiply/reduce flavour.
	 * enc       : if non-blank, the prefix of the \enc\()_1x / \enc\()_4x
	 *             macros invoked to CTR-en/decrypt each block in flight
	 *             (GCM mode). Blank for plain GHASH.
	 * aggregate : when 1 and pn==p64, process 4 blocks per iteration
	 *             using H^1..H^4 (Karatsuba aggregation).
	 * head      : when 1, a partial head block pointer is taken from [sp]
	 *             and hashed first.
	 *
	 * Register contract: r0 = #blocks, r1 = dg[] (current digest),
	 * r2 = src, r3 = key struct. Leaves the new digest in XL;
	 * the caller stores it back to [r1].
	 */
	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0			@ Z set -> only the head block
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	@ The en/decrypt helpers clobber the key registers, so reload
	@ H^2..H^4 and recompute the folded halves and the reduction mask.
	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2		@ GHASH is big-endian
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1		@ fold digest into first block

	vrev64.8	T1, T3
	vrev64.8	T3, T2

	@ 4 Karatsuba multiplies: block i is multiplied by H^(4-i),
	@ and all partial products are accumulated into XL/XM/XH.
	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f			@ last 4 blocks done -> reduce & exit

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
	.endif

	@ single-block path (also the p8 path)
2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1

	@ recompute key material clobbered by the en/decrypt helper
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b			@ more blocks remaining
	.endm
	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 *
	 * Crypto Extensions (vmull.p64) version. The key struct holds
	 * H, H^2, H^3, H^4 back to back, enabling 4-way aggregation.
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	@ precompute folded (L^H) key halves for the Karatsuba middle terms
	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1		@ GHASH reduction polynomial
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]
	bx		lr
ENDPROC(pmull_ghash_update_p64)
	/*
	 * Plain-NEON (vmull.p8) fallback version of pmull_ghash_update for
	 * CPUs without the Crypto Extensions. Precomputes the byte rotations
	 * of the key and the recombination masks consumed by __pmull_p8.
	 */
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	@ byte rotations of H by 1..4, hoisted out of __pmull_p8
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	@ masks used to strip wrapped-around bits from the partial products
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]
	bx		lr
ENDPROC(pmull_ghash_update_p8)
	@ Register aliases for the AES-CTR side of GCM. e0-e3 hold up to four
	@ encrypted counter blocks; ctr is the working counter (ctr0 = IV,
	@ ctr1 = rotated IV word + 32-bit counter); ek0/ek1 stream round keys.
	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27
	ek0		.req	q14
	ek1		.req	q15

	/*
	 * round - apply one AES round (AESE + AESMC) with round key \rk to
	 * every register in \regs.
	 */
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm
	/*
	 * aes_encrypt - encrypt the blocks in \regs in parallel.
	 *
	 * rkp    : pointer to the round keys (advanced as they are loaded)
	 * rounds : 10/12/14 for AES-128/192/256; the cmp below dispatches
	 *          past the extra rounds for the shorter key sizes
	 * regs   : one or more q registers holding the plaintext blocks
	 *
	 * Round keys are streamed through ek0/ek1 two at a time. The final
	 * round is AESE without MixColumns, followed by the last key xor.
	 * Clobbers ek0, ek1 and flags.
	 */
	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	beq		.L\@			// AES-192
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	@ final round: no MixColumns, then xor with the last round key
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]
	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm
	/*
	 * pmull_aes_encrypt - produce one encrypted counter block in e0.
	 *
	 * In:  r3 = key struct (round keys at offset 64), r5 = 12-byte IV,
	 *      r6 = #rounds, r7 = 32-bit block counter (incremented here).
	 * Out: e0 = E(K, IV || counter). Clobbers r8, ip, ctr, ek0, ek1.
	 */
pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7			// counter is big-endian on the wire
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8

	vmov		e0, ctr

	add		ip, r3, #64		// round keys live at key+64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)
	/*
	 * pmull_aes_encrypt_4x - produce four consecutive encrypted counter
	 * blocks in e0..e3. Same register contract as pmull_aes_encrypt;
	 * r7 is advanced by 4.
	 */
pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4

	@ snapshot the counter block once per counter value, bumping the
	@ (big-endian) counter word in between
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr

	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr

	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr

	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)
	/*
	 * pmull_aes_encrypt_final - produce the key stream for the last
	 * (possibly partial) block in e0, and the encrypted J0 block
	 * (counter == 1) used to mask the authentication tag in e1.
	 * Same register contract as pmull_aes_encrypt; r7 is consumed.
	 */
pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8

	vmov		e0, ctr			@ key stream for the tail block
	vmov.32		ctr1[1], r7
	vmov		e1, ctr			@ J0 block for the tag

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)
	/*
	 * enc_1x/dec_1x and enc_4x/dec_4x - CTR en/decrypt helpers invoked
	 * from ghash_update via \enc\()_1x / \enc\()_4x.
	 *
	 * Encryption xors the key stream into the plaintext block(s) so the
	 * ciphertext is both stored to [r4] and fed on to GHASH; decryption
	 * keeps the ciphertext for GHASH and stores the plaintext instead.
	 */
	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm
	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 *
	 * Encrypt whole blocks and fold the ciphertext into the GHASH
	 * digest in one pass. Stack args (dst, iv, rounds, counter) are
	 * pulled into r4-r7 for the helpers above.
	 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)
	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 *
	 * Decrypt whole blocks, folding the *ciphertext* into the GHASH
	 * digest (dec_1x/dec_4x keep the input for GHASH).
	 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)
  463. /*
  464. * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
  465. * struct gcm_key const *k, char *head,
  466. * char *iv, int rounds, u32 counter)
  467. */
  468. ENTRY(pmull_gcm_enc_final)
  469. push {r4-r8, lr}
  470. ldrd r4, r5, [sp, #24]
  471. ldrd r6, r7, [sp, #32]
  472. bl pmull_aes_encrypt_final
  473. cmp r0, #0
  474. beq .Lenc_final
  475. mov_l ip, .Lpermute
  476. sub r4, r4, #16
  477. add r8, ip, r0
  478. add ip, ip, #32
  479. add r4, r4, r0
  480. sub ip, ip, r0
  481. vld1.8 {e3}, [r8] // permute vector for key stream
  482. vld1.8 {e2}, [ip] // permute vector for ghash input
  483. vtbl.8 e3l, {e0}, e3l
  484. vtbl.8 e3h, {e0}, e3h
  485. vld1.8 {e0}, [r4] // encrypt tail block
  486. veor e0, e0, e3
  487. vst1.8 {e0}, [r4]
  488. vtbl.8 T1_L, {e0}, e2l
  489. vtbl.8 T1_H, {e0}, e2h
  490. vld1.64 {XL}, [r1]
  491. .Lenc_final:
  492. vld1.64 {SHASH}, [r3, :128]
  493. vmov.i8 MASK, #0xe1
  494. veor SHASH2_p64, SHASH_L, SHASH_H
  495. vshl.u64 MASK, MASK, #57
  496. mov r0, #1
  497. bne 3f // process head block first
  498. ghash_update p64, aggregate=0, head=0
  499. vrev64.8 XL, XL
  500. vext.8 XL, XL, XL, #8
  501. veor XL, XL, e1
  502. sub r2, r2, #16 // rewind src pointer
  503. vst1.8 {XL}, [r2] // store tag
  504. pop {r4-r8, pc}
  505. ENDPROC(pmull_gcm_enc_final)
	/*
	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
	 *			   struct gcm_key const *k, char *head,
	 *			   char *iv, int rounds, u32 counter,
	 *			   const char *otag, int authsize)
	 *
	 * Handle the final partial block, finish the digest, and compare
	 * the computed tag against otag (first authsize bytes only).
	 * Returns 0 in r0 on match, non-zero on mismatch - computed without
	 * data-dependent branches.
	 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0			@ NB: flags are reused at 'bne 3f' below
	beq		.Ldec_final

	@ Partial tail block, mirroring pmull_gcm_enc_final - but the
	@ *ciphertext* (input) is fed to GHASH before being decrypted.
	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0
	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l		@ padded ciphertext -> GHASH input
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1			@ one block; mov leaves flags intact
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	@ finalize the tag as in the encrypt path
	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	@ constant-time tag comparison over the first authsize bytes
	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0
	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)
.section	".rodata", "a", %progbits
	.align		5
	@ Sliding-window permute table for vtbl: indexing at .Lpermute + n
	@ (0 < n < 16) yields a vector that selects the first n bytes and
	@ zeroes the rest (0xff indices make vtbl produce 0); indexing at
	@ .Lpermute + 32 - n selects the last n bytes, zero-padded.
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff