/* polyval-ce-core.S — POLYVAL acceleration core using ARMv8 Crypto Extensions. */
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * Implementation of POLYVAL using ARMv8 Crypto Extensions.
  4. *
  5. * Copyright 2021 Google LLC
  6. */
  7. /*
  8. * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions
  9. * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8,
  10. * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
  11. * finite field multiplication into two steps.
  12. *
  13. * In the first step, we consider h^i, m_i as normal polynomials of degree less
  14. * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
  15. * is simply polynomial multiplication.
  16. *
  17. * In the second step, we compute the reduction of p(x) modulo the finite field
  18. * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
  19. *
  20. * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
  21. * multiplication is finite field multiplication. The advantage is that the
  22. * two-step process only requires 1 finite field reduction for every 8
  23. * polynomial multiplications. Further parallelism is gained by interleaving the
  24. * multiplications and polynomial reductions.
  25. */
#include <linux/linkage.h>

#define STRIDE_BLOCKS 8

/*
 * Scalar register roles (AAPCS64: x0-x3 are the function arguments).
 */
KEY_POWERS	.req	x0	// pointer to precomputed key powers h^8..h^1
MSG		.req	x1	// pointer to message blocks
BLOCKS_LEFT	.req	x2	// number of 16-byte blocks left to hash
ACCUMULATOR	.req	x3	// pointer to the 16-byte accumulator
KEY_START	.req	x10	// saved start of the key-powers table
EXTRA_BYTES	.req	x11	// NOTE(review): alias declared but unused in this file
TMP		.req	x13	// scratch (address of .Lgstar)

/*
 * Vector register roles. M0-M7 hold message blocks, KEY8-KEY1 the
 * corresponding key powers (KEYi multiplies the block that is i blocks
 * from the end of the stride).
 */
M0	.req	v0
M1	.req	v1
M2	.req	v2
M3	.req	v3
M4	.req	v4
M5	.req	v5
M6	.req	v6
M7	.req	v7

KEY8	.req	v8
KEY7	.req	v9
KEY6	.req	v10
KEY5	.req	v11
KEY4	.req	v12
KEY3	.req	v13
KEY2	.req	v14
KEY1	.req	v15

PL	.req	v16	// low 128 bits of the unreduced 256-bit product
PH	.req	v17	// high 128 bits of the unreduced 256-bit product
TMP_V	.req	v18	// scratch vector used during reduction
LO	.req	v20	// Karatsuba partial product X_0 * Y_0
MI	.req	v21	// Karatsuba partial product (X_0 + X_1)(Y_0 + Y_1)
HI	.req	v22	// Karatsuba partial product X_1 * Y_1
SUM	.req	v23	// running POLYVAL accumulator
GSTAR	.req	v24	// reduction constant g*(x), duplicated in both lanes

	.text
	.arch	armv8-a+crypto

	.align	4
.Lgstar:
	// g*(x) = bits 64-127 of g(x); both 64-bit lanes hold the same value
	// so that both pmull (low lanes) and pmull2 (high lanes) can use it.
	.quad	0xc200000000000000, 0xc200000000000000
/*
 * Computes the product of two 128-bit polynomials in X and Y and XORs the
 * components of the 256-bit product into LO, MI, HI.
 *
 * Given:
 *	X = [X_1 : X_0]
 *	Y = [Y_1 : Y_0]
 *
 * We compute:
 *	LO += X_0 * Y_0
 *	MI += (X_0 + X_1) * (Y_0 + Y_1)
 *	HI += X_1 * Y_1
 *
 * Later, the 256-bit result can be extracted as:
 *	[HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
 * This step is done when computing the polynomial reduction for efficiency
 * reasons.
 *
 * Karatsuba multiplication is used instead of Schoolbook multiplication
 * because it was found to be slightly faster on ARM64 CPUs.
 *
 * Clobbers: v25-v29.
 */
.macro karatsuba1 X Y
	X .req \X
	Y .req \Y
	ext	v25.16b, X.16b, X.16b, #8	// v25 = [X_0 : X_1]
	ext	v26.16b, Y.16b, Y.16b, #8	// v26 = [Y_0 : Y_1]
	eor	v25.16b, v25.16b, X.16b		// v25 lanes = X_0 + X_1
	eor	v26.16b, v26.16b, Y.16b		// v26 lanes = Y_0 + Y_1
	pmull2	v28.1q, X.2d, Y.2d		// v28 = X_1 * Y_1
	pmull	v29.1q, X.1d, Y.1d		// v29 = X_0 * Y_0
	pmull	v27.1q, v25.1d, v26.1d		// v27 = (X_0+X_1)(Y_0+Y_1)
	eor	HI.16b, HI.16b, v28.16b
	eor	LO.16b, LO.16b, v29.16b
	eor	MI.16b, MI.16b, v27.16b
	.unreq X
	.unreq Y
.endm
/*
 * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
 * them.
 *
 * Clobbers: v25, v26.
 */
.macro karatsuba1_store X Y
	X .req \X
	Y .req \Y
	ext	v25.16b, X.16b, X.16b, #8	// v25 = [X_0 : X_1]
	ext	v26.16b, Y.16b, Y.16b, #8	// v26 = [Y_0 : Y_1]
	eor	v25.16b, v25.16b, X.16b		// v25 lanes = X_0 + X_1
	eor	v26.16b, v26.16b, Y.16b		// v26 lanes = Y_0 + Y_1
	pmull2	HI.1q, X.2d, Y.2d		// HI = X_1 * Y_1
	pmull	LO.1q, X.1d, Y.1d		// LO = X_0 * Y_0
	pmull	MI.1q, v25.1d, v26.1d		// MI = (X_0+X_1)(Y_0+Y_1)
	.unreq X
	.unreq Y
.endm
/*
 * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
 * the result in PL, PH.
 *	[PH : PL] =
 *	[HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
 *
 * Clobbers: v4, v5. Also destroys LO and HI (both are left with their
 * halves swapped), so LO/MI/HI must be re-initialized before reuse.
 */
.macro karatsuba2
	// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
	eor	v4.16b, HI.16b, MI.16b
	// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
	eor	v4.16b, v4.16b, LO.16b
	// v5 = [HI_0 : LO_1]
	ext	v5.16b, LO.16b, HI.16b, #8
	// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
	eor	v4.16b, v4.16b, v5.16b
	// HI = [HI_0 : HI_1]
	ext	HI.16b, HI.16b, HI.16b, #8
	// LO = [LO_0 : LO_1]
	ext	LO.16b, LO.16b, LO.16b, #8
	// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
	ext	PH.16b, v4.16b, HI.16b, #8
	// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
	ext	PL.16b, LO.16b, v4.16b, #8
.endm
/*
 * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
 *
 * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
 * x^128 + x^127 + x^126 + x^121 + 1.
 *
 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
 * product of two 128-bit polynomials in Montgomery form. We need to reduce it
 * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
 * of x^128, this product has two extra factors of x^128. To get it back into
 * Montgomery form, we need to remove one of these factors by dividing by x^128.
 *
 * To accomplish both of these goals, we add multiples of g(x) that cancel out
 * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
 * bits are zero, the polynomial division by x^128 can be done by right
 * shifting.
 *
 * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
 * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
 * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
 * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
 * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
 * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
 *
 * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
 * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
 * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
 * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
 * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
 *
 * So our final computation is:
 *	T = T_1 : T_0 = g*(x) * P_0
 *	V = V_1 : V_0 = g*(x) * (P_1 + T_0)
 *	p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 *
 * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
 * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
 * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
 *
 * Clobbers: TMP_V, PH.
 */
.macro montgomery_reduction dest
	DEST .req \dest
	// TMP_V = T_1 : T_0 = P_0 * g*(x)
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	// TMP_V = T_0 : T_1
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	// TMP_V = P_1 + T_0 : P_0 + T_1
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
	eor	PH.16b, PH.16b, TMP_V.16b
	// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	eor	DEST.16b, PH.16b, TMP_V.16b
	.unreq DEST
.endm
/*
 * Compute Polyval on 8 blocks.
 *
 * If reduce is set, also computes the montgomery reduction of the
 * previous full_stride call and XORs with the first message block.
 *	(m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
 * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
 *
 * The reduction steps (the same sequence as montgomery_reduction, with the
 * result landing in SUM) are interleaved between the karatsuba1 calls so the
 * pmull-based reduction overlaps with the multiplications for instruction-
 * level parallelism.
 *
 * Sets PL, PH. Advances MSG by 128 bytes. Reads SUM as the value to fold into
 * m_0 and, when \reduce is set, updates SUM with the reduced previous stride.
 * Clobbers M0-M7, LO, MI, HI, TMP_V, v4, v5, v25-v29.
 */
.macro full_stride reduce
	// Zero the Karatsuba accumulators for this stride.
	eor	LO.16b, LO.16b, LO.16b
	eor	MI.16b, MI.16b, MI.16b
	eor	HI.16b, HI.16b, HI.16b

	// Load 8 message blocks, post-incrementing MSG.
	ld1	{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
	ld1	{M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64

	karatsuba1 M7 KEY1
	.if \reduce
	// Reduction step 1: TMP_V = T = P_0 * g*(x)
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	.endif

	karatsuba1 M6 KEY2
	.if \reduce
	// Reduction step 2: swap T's halves
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	.endif

	karatsuba1 M5 KEY3
	.if \reduce
	// Reduction step 3: TMP_V = P_1 + T_0 : P_0 + T_1
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	.endif

	karatsuba1 M4 KEY4
	.if \reduce
	// Reduction step 4: fold into PH
	eor	PH.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M3 KEY5
	.if \reduce
	// Reduction step 5: TMP_V = V = (P_1 + T_0) * g*(x)
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	.endif

	karatsuba1 M2 KEY6
	.if \reduce
	// Reduction step 6: SUM = reduced result of the previous stride
	eor	SUM.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M1 KEY7

	// Fold the running accumulator into the first message block.
	eor	M0.16b, M0.16b, SUM.16b

	karatsuba1 M0 KEY8
	karatsuba2
.endm
/*
 * Handle any extra blocks after full_stride loop.
 *
 * On entry BLOCKS_LEFT is in [1, STRIDE_BLOCKS - 1]. The remaining blocks
 * must be multiplied by h^BLOCKS_LEFT .. h^1, so KEY_POWERS is pointed at
 * entry (STRIDE_BLOCKS - BLOCKS_LEFT) of the table, i.e.
 * KEY_START + (STRIDE_BLOCKS - BLOCKS_LEFT) * 16.
 *
 * The first remaining block absorbs the accumulator (SUM), then the rest are
 * handled in runs of 4, 2, and 1 by testing the bits of the remaining count.
 * Leaves the reduced result in SUM.
 */
.macro partial_stride
	// KEY_POWERS = &key_powers[STRIDE_BLOCKS - BLOCKS_LEFT]
	add	KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
	sub	KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4

	// First block: multiply (m + SUM) by h^BLOCKS_LEFT, initializing
	// LO/MI/HI via the storing variant.
	ld1	{KEY1.16b}, [KEY_POWERS], #16
	ld1	{TMP_V.16b}, [MSG], #16
	eor	SUM.16b, SUM.16b, TMP_V.16b
	karatsuba1_store KEY1 SUM
	sub	BLOCKS_LEFT, BLOCKS_LEFT, #1

	tst	BLOCKS_LEFT, #4
	beq	.Lpartial4BlocksDone
	ld1	{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
	karatsuba1 M2 KEY6
	karatsuba1 M3 KEY5
.Lpartial4BlocksDone:
	tst	BLOCKS_LEFT, #2
	beq	.Lpartial2BlocksDone
	ld1	{M0.16b, M1.16b}, [MSG], #32
	ld1	{KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
.Lpartial2BlocksDone:
	tst	BLOCKS_LEFT, #1
	beq	.LpartialDone
	ld1	{M0.16b}, [MSG], #16
	ld1	{KEY8.16b}, [KEY_POWERS], #16
	karatsuba1 M0 KEY8
.LpartialDone:
	// Combine the partial products and reduce into SUM.
	karatsuba2
	montgomery_reduction SUM
.endm
/*
 * Perform montgomery multiplication in GF(2^128) and store result in op1.
 *
 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
 * If op1, op2 are in montgomery form, this computes the montgomery
 * form of op1*op2.
 *
 * void pmull_polyval_mul(u8 *op1, const u8 *op2);
 *
 * x0 - pointer to op1 (input and output, 16 bytes)
 * x1 - pointer to op2 (input, 16 bytes)
 */
SYM_FUNC_START(pmull_polyval_mul)
	adr	TMP, .Lgstar
	ld1	{GSTAR.2d}, [TMP]
	ld1	{v0.16b}, [x0]
	ld1	{v1.16b}, [x1]
	karatsuba1_store v0 v1		// LO/MI/HI = partial products
	karatsuba2			// PH:PL = 256-bit product
	montgomery_reduction SUM	// SUM = reduced 128-bit result
	st1	{SUM.16b}, [x0]
	ret
SYM_FUNC_END(pmull_polyval_mul)
/*
 * Perform polynomial evaluation as specified by POLYVAL. This computes:
 *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
 * where n=nblocks, h is the hash key, and m_i are the message blocks.
 *
 * x0 - pointer to precomputed key powers h^8 ... h^1
 * x1 - pointer to message blocks
 * x2 - number of blocks to hash
 * x3 - pointer to accumulator
 *
 * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,
 *			     size_t nblocks, u8 *accumulator);
 *
 * The main loop runs full_stride on 8 blocks at a time; the Montgomery
 * reduction of each stride is deferred and interleaved into the next
 * stride (full_stride 1). Leftover blocks (1-7) go through partial_stride.
 */
SYM_FUNC_START(pmull_polyval_update)
	adr	TMP, .Lgstar
	mov	KEY_START, KEY_POWERS	// partial_stride re-derives key addresses
	ld1	{GSTAR.2d}, [TMP]
	ld1	{SUM.16b}, [ACCUMULATOR]
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt	.LstrideLoopExit	// fewer than 8 blocks total

	// Load all 8 key powers once; they stay resident in KEY8..KEY1.
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
	ld1	{KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64

	// First stride has no previous product to reduce.
	full_stride 0
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt	.LstrideLoopExitReduce

.LstrideLoop:
	// Each subsequent stride also reduces the previous stride's PH:PL.
	full_stride 1
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	bge	.LstrideLoop

.LstrideLoopExitReduce:
	// Reduce the final stride's product into the accumulator.
	montgomery_reduction SUM

.LstrideLoopExit:
	// BLOCKS_LEFT went negative; restore the true remaining count (0-7).
	adds	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	beq	.LskipPartial
	partial_stride
.LskipPartial:
	st1	{SUM.16b}, [ACCUMULATOR]
	ret
SYM_FUNC_END(pmull_polyval_update)