/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	.text
	.fpu		crypto-neon-fp-armv8
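
	/*
	 * Note: the .fpu directive above lets the assembler accept the
	 * v8 Crypto Extensions instruction vmull.p64; the plain NEON
	 * vmull.p8 fallback path does not use it at run time.
	 */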

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
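
	/*
	 * The b1..b4 operands are ignored here; they exist only so that
	 * ghash_update can expand __pmull_p64 and __pmull_p8 with the
	 * same argument list.
	 */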

/*
 * This implementation of 64x64 -> 128 bit polynomial multiplication
 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
 * "Fast Software Polynomial Multiplication on ARM Processors Using
 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
 *
 * It has been slightly tweaked for in-order performance, and to allow
 * 'rq' to overlap with 'ad' or 'bd'.
 */

	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm
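
	/*
	 * The partial products above are recombined as
	 *
	 *   A*B = D ^ (t0 << 8) ^ (t1 << 16) ^ (t2 << 24) ^ (t3 << 32)
	 *
	 * where the vext.8 #15/#14/#13/#12 instructions perform the byte
	 * sized shifts on the 128 bit vectors, and the k16/k32/k48 masks
	 * clear the bits that would otherwise wrap around.
	 */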

//
// PMULL (64x64->128) based reduction for CPUs that can do
// it in a single instruction.
//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm
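
	/*
	 * Both reductions fold the 256 bit product modulo the GHASH
	 * polynomial x^128 + x^7 + x^2 + x + 1, in the bit-reflected
	 * representation of GF(2^128) that GHASH mandates. The p64
	 * variant performs the fold by multiplying with the MASK
	 * constant, which is derived from 0xe1 in
	 * pmull_ghash_update_p64 below.
	 */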

//
// Alternative reduction for CPUs that lack support for the
// 64x64->128 PMULL instruction
//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
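
	/*
	 * Here the multiplication by the reduction polynomial is carried
	 * out with plain shifts and XORs instead: the left shifts by #57,
	 * #62 and #63 (i.e. 64 - 7, 64 - 2 and 64 - 1) and the matching
	 * right shifts account for the x^7, x^2 and x terms, so no
	 * 64x64 carryless multiply is needed.
	 */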

	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		1f

0:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

1:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm
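
	/*
	 * The three __pmull invocations above form one Karatsuba style
	 * 128x128 -> 256 bit multiplication out of three 64x64 bit ones:
	 * XH = a1*b1, XL = a0*b0 and XM = (a1+a0)*(b1+b0), with the
	 * middle term folded in before the reduction.
	 */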

/*
 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 *			   struct ghash_key const *k, const char *head)
 */
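
/*
 * Per the AAPCS, the arguments arrive as r0 = blocks, r1 = dg,
 * r2 = src and r3 = k, with head spilled to the stack (hence the
 * 'ldr ip, [sp]' in ghash_update above).
 */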

ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p64, SHASH_L, SHASH_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)
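
	/*
	 * The p8 entry point precomputes the rotated copies of SHASH
	 * (s1l..s4h) and the k16/k32/k48 masks once per call, so that the
	 * .ifc guards in __pmull_p8 can skip the B1..B4 vext steps on
	 * every iteration of the block loop.
	 */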