// crct10dif-ce-core.S
  1. //
  2. // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
  3. //
  4. // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
  5. // Copyright (C) 2019 Google LLC <ebiggers@google.com>
  6. //
  7. // This program is free software; you can redistribute it and/or modify
  8. // it under the terms of the GNU General Public License version 2 as
  9. // published by the Free Software Foundation.
  10. //
  11. // Derived from the x86 version:
  12. //
  13. // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
  14. //
  15. // Copyright (c) 2013, Intel Corporation
  16. //
  17. // Authors:
  18. // Erdinc Ozturk <erdinc.ozturk@intel.com>
  19. // Vinodh Gopal <vinodh.gopal@intel.com>
  20. // James Guilford <james.guilford@intel.com>
  21. // Tim Chen <tim.c.chen@linux.intel.com>
  22. //
  23. // This software is available to you under a choice of one of two
  24. // licenses. You may choose to be licensed under the terms of the GNU
  25. // General Public License (GPL) Version 2, available from the file
  26. // COPYING in the main directory of this source tree, or the
  27. // OpenIB.org BSD license below:
  28. //
  29. // Redistribution and use in source and binary forms, with or without
  30. // modification, are permitted provided that the following conditions are
  31. // met:
  32. //
  33. // * Redistributions of source code must retain the above copyright
  34. // notice, this list of conditions and the following disclaimer.
  35. //
  36. // * Redistributions in binary form must reproduce the above copyright
  37. // notice, this list of conditions and the following disclaimer in the
  38. // documentation and/or other materials provided with the
  39. // distribution.
  40. //
  41. // * Neither the name of the Intel Corporation nor the names of its
  42. // contributors may be used to endorse or promote products derived from
  43. // this software without specific prior written permission.
  44. //
  45. //
  46. // THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
  47. // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  48. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  49. // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  50. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  51. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  52. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  53. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  54. // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  55. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  56. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  57. //
  58. // Reference paper titled "Fast CRC Computation for Generic
  59. // Polynomials Using PCLMULQDQ Instruction"
  60. // URL: http://www.intel.com/content/dam/www/public/us/en/documents
  61. // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  62. //
  63. #include <linux/linkage.h>
  64. #include <asm/assembler.h>
  // Assemble into .text, targeting AArch64 with the Crypto Extensions enabled.
  .text
  .arch           armv8-a+crypto

  // Arguments shared by both entry points, plus the constant-table cursor.
  init_crc        .req    w0      // initial CRC value (first argument)
  buf             .req    x1      // data pointer, advanced as bytes are consumed
  len             .req    x2      // byte count, decremented as bytes are folded
  fold_consts_ptr .req    x3      // cursor into the fold-constant table

  // Pair of fold constants currently in use.
  fold_consts     .req    v10

  // p8 variant only: private copy of the 'A' multiplicand for the core routine.
  ad              .req    v14

  // p8 variant only: masks built by __pmull_init_p8.
  k00_16          .req    v15
  k32_48          .req    v16

  // p8 variant only: temporaries of the 8x8 multiply core.
  t3              .req    v17
  t4              .req    v18
  t5              .req    v19
  t6              .req    v20
  t7              .req    v21
  t8              .req    v22
  t9              .req    v23

  // p8 variant only: tbl index vectors built by __pmull_init_p8.
  perm1           .req    v24
  perm2           .req    v25
  perm3           .req    v26
  perm4           .req    v27

  // p8 variant only: permuted copies of the 'B' multiplier, refreshed by
  // __pmull_pre_p8 each time new fold constants are loaded.
  bd1             .req    v28
  bd2             .req    v29
  bd3             .req    v30
  bd4             .req    v31
  // One-time setup hook for the p64 variant. Intentionally expands to nothing;
  // it exists so crc_t10dif_pmull can invoke the init hook for either variant.
  .macro          __pmull_init_p64
  .endm
  // Per-constant preparation hook for the p64 variant, invoked after each load
  // of new fold constants. Intentionally expands to nothing; the bd argument is
  // accepted only so the call sites look the same as for the p8 variant.
  .macro          __pmull_pre_p64, bd
  .endm
  // One-time setup for the p8 variant: builds the tbl index vectors
  // perm1..perm4 and the k00_16 / k32_48 masks used by __pmull_pre_p8 and by
  // the p8 multiply core. Clobbers x5.
  .macro          __pmull_init_p8
          // Prepare the permutation vectors. perm1 rotates each 64-bit half
          // by one byte: its low half holds the indices 1..7,0 and its high
          // half 9..15,8. perm4 briefly serves as scratch for the 0x08 mask
          // that is applied to the low half only.
          mov_q           x5, 0x080f0e0d0c0b0a09
          movi            perm4.8b, #8
          dup             perm1.2d, x5
          eor             perm1.16b, perm1.16b, perm4.16b

          // perm2..perm4 := perm1 with each 64-bit lane rotated right by a
          // further 8, 16 and 24 bits. ushr provides the shifted-down part,
          // sli inserts the bits that wrap around to the top.
          ushr            perm2.2d, perm1.2d, #8
          ushr            perm3.2d, perm1.2d, #16
          ushr            perm4.2d, perm1.2d, #24
          sli             perm2.2d, perm1.2d, #56
          sli             perm3.2d, perm1.2d, #48
          sli             perm4.2d, perm1.2d, #40

          // k32_48 := 0x00000000ffffffff_0000ffffffffffff
          // k00_16 := 0x0000000000000000_000000000000ffff
          movi            k32_48.2d, #0xffffffff
          mov             k32_48.h[2], k32_48.h[0]
          ushr            k00_16.2d, k32_48.2d, #32
  .endm
  // Per-constant preparation for the p8 variant: derives bd1..bd4 from the
  // register named by bd by permuting its bytes with perm1..perm4 (as built
  // by __pmull_init_p8). Must be re-run whenever that register is reloaded.
  .macro          __pmull_pre_p8, bd
          tbl             bd1.16b, {\bd\().16b}, perm1.16b
          tbl             bd2.16b, {\bd\().16b}, perm2.16b
          tbl             bd3.16b, {\bd\().16b}, perm3.16b
          tbl             bd4.16b, {\bd\().16b}, perm4.16b
  .endm
  // Out-of-line part of the __pmull_p8 macro: computes the byte-shifted partial
  // products that the macro XORs into its own D = A*B result.
  //
  // In:   ad            multiplicand A (copied there by __pmull_p8)
  //       fold_consts   multiplier B
  //       bd1..bd4      permuted copies of B (see __pmull_pre_p8)
  //       perm1..perm3, k00_16, k32_48 as built by __pmull_init_p8
  // Out:  t4, t6        partial results for the caller to XOR into D
  // Clobbers t3..t9.
  //
  // Two entry points, both reached with bl from __pmull_p8:
  //   .L__pmull_p8_core   works on the low 64 bits of the inputs (pmull)
  //   .L__pmull_p8_core2  works on the high 64 bits of the inputs (pmull2)
  SYM_FUNC_START_LOCAL(__pmull_p8_core)
  .L__pmull_p8_core:
          ext             t4.8b, ad.8b, ad.8b, #1         // A1
          ext             t5.8b, ad.8b, ad.8b, #2         // A2
          ext             t6.8b, ad.8b, ad.8b, #3         // A3

          pmull           t4.8h, t4.8b, fold_consts.8b    // F = A1*B
          pmull           t8.8h, ad.8b, bd1.8b            // E = A*B1
          pmull           t5.8h, t5.8b, fold_consts.8b    // H = A2*B
          pmull           t7.8h, ad.8b, bd2.8b            // G = A*B2
          pmull           t6.8h, t6.8b, fold_consts.8b    // J = A3*B
          pmull           t9.8h, ad.8b, bd3.8b            // I = A*B3
          pmull           t3.8h, ad.8b, bd4.8b            // K = A*B4
          b               .L__pmull_p8_core_combine

  .L__pmull_p8_core2:
          // Same products as above, taken from the upper halves. The byte
          // rotations of A come from tbl, since a .8b ext cannot reach the
          // upper 64 bits.
          tbl             t4.16b, {ad.16b}, perm1.16b     // A1
          tbl             t5.16b, {ad.16b}, perm2.16b     // A2
          tbl             t6.16b, {ad.16b}, perm3.16b     // A3

          pmull2          t4.8h, t4.16b, fold_consts.16b  // F = A1*B
          pmull2          t8.8h, ad.16b, bd1.16b          // E = A*B1
          pmull2          t5.8h, t5.16b, fold_consts.16b  // H = A2*B
          pmull2          t7.8h, ad.16b, bd2.16b          // G = A*B2
          pmull2          t6.8h, t6.16b, fold_consts.16b  // J = A3*B
          pmull2          t9.8h, ad.16b, bd3.16b          // I = A*B3
          pmull2          t3.8h, ad.16b, bd4.16b          // K = A*B4

  .L__pmull_p8_core_combine:
          // Common tail of both entry points.
          eor             t4.16b, t4.16b, t8.16b          // L = E + F
          eor             t5.16b, t5.16b, t7.16b          // M = G + H
          eor             t6.16b, t6.16b, t9.16b          // N = I + J

          uzp1            t8.2d, t4.2d, t5.2d
          uzp2            t4.2d, t4.2d, t5.2d
          uzp1            t7.2d, t6.2d, t3.2d
          uzp2            t6.2d, t6.2d, t3.2d

          // t4 = (L) (P0 + P1) << 8
          // t5 = (M) (P2 + P3) << 16
          eor             t8.16b, t8.16b, t4.16b
          and             t4.16b, t4.16b, k32_48.16b

          // t6 = (N) (P4 + P5) << 24
          // t7 = (K) (P6 + P7) << 32
          eor             t7.16b, t7.16b, t6.16b
          and             t6.16b, t6.16b, k00_16.16b

          eor             t8.16b, t8.16b, t4.16b
          eor             t7.16b, t7.16b, t6.16b

          zip2            t5.2d, t8.2d, t4.2d
          zip1            t4.2d, t8.2d, t4.2d
          zip2            t3.2d, t7.2d, t6.2d
          zip1            t6.2d, t7.2d, t6.2d

          // Rotate each term into position (by 15, 14, 13 and 12 bytes) and
          // merge the four terms into the two result registers.
          ext             t4.16b, t4.16b, t4.16b, #15
          ext             t5.16b, t5.16b, t5.16b, #14
          ext             t6.16b, t6.16b, t6.16b, #13
          ext             t3.16b, t3.16b, t3.16b, #12

          eor             t4.16b, t4.16b, t5.16b
          eor             t6.16b, t6.16b, t3.16b
          ret
  SYM_FUNC_END(__pmull_p8_core)
  // p8 counterpart of __pmull_p64, used at the same call sites: multiplies the
  // low 64 bits (i blank) or the high 64 bits (i = 2) of register ad by the
  // matching half of bd, leaving the result in rq. Only 8-bit PMULL is used.
  //
  // bd must literally be 'fold_consts' (enforced at assembly time), because
  // the core routine reads fold_consts and bd1..bd4 directly. Clobbers the
  // register aliased as 'ad', t3..t9 and, through bl, the link register.
  .macro          __pmull_p8, rq, ad, bd, i
          .ifnc           \bd, fold_consts
          .err
          .endif

          // Snapshot the multiplicand before the multiply: the destination
          // may name the same register and is overwritten just below.
          mov             ad.16b, \ad\().16b

          .ifb            \i
          pmull           \rq\().8h, \ad\().8b, \bd\().8b         // D = A*B
          .else
          pmull2          \rq\().8h, \ad\().16b, \bd\().16b       // D = A*B
          .endif

          // Selects .L__pmull_p8_core or .L__pmull_p8_core2 by suffix.
          bl              .L__pmull_p8_core\i

          // Fold the partial results returned in t4 and t6 into D.
          eor             \rq\().16b, \rq\().16b, t4.16b
          eor             \rq\().16b, \rq\().16b, t6.16b
  .endm
  // Fold reg1, reg2 into the next 32 data bytes, storing the result back
  // into reg1, reg2.
  //
  // p selects the multiply flavour (p8 or p64). Advances buf by 32 bytes and
  // uses v8, v9, v11 and v12 as scratch, in addition to whatever the selected
  // multiply macro clobbers.
  .macro          fold_32_bytes, p, reg1, reg2
          ldp             q11, q12, [buf], #0x20

          // The byte swap of the freshly loaded data is interleaved with the
          // multiplies below (presumably for instruction scheduling; the data
          // is not needed until the final XORs).
          __pmull_\p      v8, \reg1, fold_consts, 2
          __pmull_\p      \reg1, \reg1, fold_consts

  CPU_LE( rev64           v11.16b, v11.16b                )
  CPU_LE( rev64           v12.16b, v12.16b                )

          __pmull_\p      v9, \reg2, fold_consts, 2
          __pmull_\p      \reg2, \reg2, fold_consts

  CPU_LE( ext             v11.16b, v11.16b, v11.16b, #8   )
  CPU_LE( ext             v12.16b, v12.16b, v12.16b, #8   )

          // Result = (high half product) ^ (low half product) ^ new data.
          eor             \reg1\().16b, \reg1\().16b, v8.16b
          eor             \reg2\().16b, \reg2\().16b, v9.16b
          eor             \reg1\().16b, \reg1\().16b, v11.16b
          eor             \reg2\().16b, \reg2\().16b, v12.16b
  .endm
  // Fold src_reg into dst_reg, optionally loading the next fold constants.
  //
  // p selects the multiply flavour (p8 or p64). When load_next_consts is
  // non-blank, the next 16 bytes of constants are loaded through
  // fold_consts_ptr (post-incremented) once both multiplies with the current
  // constants have been issued. Clobbers v8 and src_reg.
  .macro          fold_16_bytes, p, src_reg, dst_reg, load_next_consts
          __pmull_\p      v8, \src_reg, fold_consts
          __pmull_\p      \src_reg, \src_reg, fold_consts, 2

          .ifnb           \load_next_consts
          ld1             {fold_consts.2d}, [fold_consts_ptr], #16
          __pmull_pre_\p  fold_consts
          .endif

          eor             \dst_reg\().16b, \dst_reg\().16b, v8.16b
          eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
  .endm
  // 64x64 -> 128-bit polynomial multiply using the Crypto Extensions:
  // rd := rn * rm on the low 64-bit lanes (n blank, pmull) or on the high
  // 64-bit lanes (n non-blank, pmull2).
  .macro          __pmull_p64, rd, rn, rm, n
          .ifb            \n
          pmull           \rd\().1q, \rn\().1d, \rm\().1d
          .else
          pmull2          \rd\().1q, \rn\().2d, \rm\().2d
          .endif
  .endm
  // Body shared by crc_t10dif_pmull_p8 and crc_t10dif_pmull_p64. The argument
  // p (p8 or p64) selects which __pmull_* helper macros get expanded.
  //
  // In:    init_crc (w0), buf (x1), len (x2). The entry points document that
  //        len >= 16 is assumed.
  // Out:   w0 = CRC. The macro ends in ret (preceded by frame_pop for p8).
  // Uses:  v0-v7 as folded state, v8/v9/v11/v12 as scratch, fold_consts (v10),
  //        fold_consts_ptr (x3), x4 and the condition flags, plus whatever the
  //        selected helper macros clobber.
  // NOTE(review): v8-v15 are written without being saved here; presumably the
  // callers run with the SIMD register state already preserved - confirm.
  .macro          crc_t10dif_pmull, p
          __pmull_init_\p

          // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
          cmp             len, #256
          b.lt            .Lless_than_256_bytes_\@

          adr_l           fold_consts_ptr, .Lfold_across_128_bytes_consts

          // Load the first 128 data bytes. Byte swapping is necessary to make
          // the bit order match the polynomial coefficient order.
          ldp             q0, q1, [buf]
          ldp             q2, q3, [buf, #0x20]
          ldp             q4, q5, [buf, #0x40]
          ldp             q6, q7, [buf, #0x60]
          add             buf, buf, #0x80
  CPU_LE( rev64           v0.16b, v0.16b                  )
  CPU_LE( rev64           v1.16b, v1.16b                  )
  CPU_LE( rev64           v2.16b, v2.16b                  )
  CPU_LE( rev64           v3.16b, v3.16b                  )
  CPU_LE( rev64           v4.16b, v4.16b                  )
  CPU_LE( rev64           v5.16b, v5.16b                  )
  CPU_LE( rev64           v6.16b, v6.16b                  )
  CPU_LE( rev64           v7.16b, v7.16b                  )
  CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8      )
  CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8      )
  CPU_LE( ext             v2.16b, v2.16b, v2.16b, #8      )
  CPU_LE( ext             v3.16b, v3.16b, v3.16b, #8      )
  CPU_LE( ext             v4.16b, v4.16b, v4.16b, #8      )
  CPU_LE( ext             v5.16b, v5.16b, v5.16b, #8      )
  CPU_LE( ext             v6.16b, v6.16b, v6.16b, #8      )
  CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8      )

          // XOR the first 16 data *bits* with the initial CRC value.
          movi            v8.16b, #0
          mov             v8.h[7], init_crc
          eor             v0.16b, v0.16b, v8.16b

          // Load the constants for folding across 128 bytes.
          ld1             {fold_consts.2d}, [fold_consts_ptr]
          __pmull_pre_\p  fold_consts

          // Subtract 128 for the 128 data bytes just consumed. Subtract
          // another 128 to simplify the termination condition of the
          // following loop.
          sub             len, len, #256

          // While >= 128 data bytes remain (not counting v0-v7), fold the 128
          // bytes v0-v7 into them, storing the result back into v0-v7.
  .Lfold_128_bytes_loop_\@:
          fold_32_bytes   \p, v0, v1
          fold_32_bytes   \p, v2, v3
          fold_32_bytes   \p, v4, v5
          fold_32_bytes   \p, v6, v7
          subs            len, len, #128
          b.ge            .Lfold_128_bytes_loop_\@

          // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

          // Fold across 64 bytes. The pointer is first stepped past the
          // fold-across-128 constants, whose load above did not advance it.
          add             fold_consts_ptr, fold_consts_ptr, #16
          ld1             {fold_consts.2d}, [fold_consts_ptr], #16
          __pmull_pre_\p  fold_consts
          fold_16_bytes   \p, v0, v4
          fold_16_bytes   \p, v1, v5
          fold_16_bytes   \p, v2, v6
          fold_16_bytes   \p, v3, v7, 1
          // Fold across 32 bytes.
          fold_16_bytes   \p, v4, v6
          fold_16_bytes   \p, v5, v7, 1
          // Fold across 16 bytes.
          fold_16_bytes   \p, v6, v7

          // Add 128 to get the correct number of data bytes remaining in
          // 0...127 (not counting v7), following the previous extra
          // subtraction by 128. Then subtract 16 to simplify the termination
          // condition of the following loop.
          adds            len, len, #(128-16)

          // While >= 16 data bytes remain (not counting v7), fold the 16
          // bytes v7 into them, storing the result back into v7.
          b.lt            .Lfold_16_bytes_loop_done_\@
  .Lfold_16_bytes_loop_\@:
          __pmull_\p      v8, v7, fold_consts
          __pmull_\p      v7, v7, fold_consts, 2
          eor             v7.16b, v7.16b, v8.16b
          ldr             q0, [buf], #16
  CPU_LE( rev64           v0.16b, v0.16b                  )
  CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8      )
          eor             v7.16b, v7.16b, v0.16b
          subs            len, len, #16
          b.ge            .Lfold_16_bytes_loop_\@

  .Lfold_16_bytes_loop_done_\@:
          // Add 16 to get the correct number of data bytes remaining in
          // 0...15 (not counting v7), following the previous extra
          // subtraction by 16.
          adds            len, len, #16
          b.eq            .Lreduce_final_16_bytes_\@

  .Lhandle_partial_segment_\@:
          // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the
          // first 16 bytes are in v7 and the rest are the remaining data in
          // 'buf'. To do this without needing a fold constant for each
          // possible 'len', redivide the bytes into a first chunk of 'len'
          // bytes and a second chunk of 16 bytes, then fold the first chunk
          // into the second.

          // v0 = last 16 original data bytes
          add             buf, buf, len
          ldr             q0, [buf, #-16]
  CPU_LE( rev64           v0.16b, v0.16b                  )
  CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8      )

          // v1 = high order part of second chunk: v7 left-shifted by 'len'
          // bytes.
          adr_l           x4, .Lbyteshift_table + 16
          sub             x4, x4, len
          ld1             {v2.16b}, [x4]
          tbl             v1.16b, {v7.16b}, v2.16b

          // v3 = first chunk: v7 right-shifted by '16-len' bytes.
          movi            v3.16b, #0x80
          eor             v2.16b, v2.16b, v3.16b
          tbl             v3.16b, {v7.16b}, v2.16b

          // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff
          // bytes.
          sshr            v2.16b, v2.16b, #7

          // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
          // then '16-len' bytes from v1 (high-order bytes).
          bsl             v2.16b, v1.16b, v0.16b

          // Fold the first chunk into the second chunk, storing the result
          // in v7.
          __pmull_\p      v0, v3, fold_consts
          __pmull_\p      v7, v3, fold_consts, 2
          eor             v7.16b, v7.16b, v0.16b
          eor             v7.16b, v7.16b, v2.16b

  .Lreduce_final_16_bytes_\@:
          // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit
          // CRC.

          movi            v2.16b, #0              // init zero register

          // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
          ld1             {fold_consts.2d}, [fold_consts_ptr], #16
          __pmull_pre_\p  fold_consts

          // Fold the high 64 bits into the low 64 bits, while also
          // multiplying by x^64. This produces a 128-bit value congruent to
          // x^64 * M(x) and whose low 48 bits are 0.
          ext             v0.16b, v2.16b, v7.16b, #8
          __pmull_\p      v7, v7, fold_consts, 2  // high bits * x^48 * (x^80 mod G(x))
          eor             v0.16b, v0.16b, v7.16b  // + low bits * x^64

          // Fold the high 32 bits into the low 96 bits. This produces a
          // 96-bit value congruent to x^64 * M(x) and whose low 48 bits are 0.
          ext             v1.16b, v0.16b, v2.16b, #12     // extract high 32 bits
          mov             v0.s[3], v2.s[0]        // zero high 32 bits
          __pmull_\p      v1, v1, fold_consts     // high 32 bits * x^48 * (x^48 mod G(x))
          eor             v0.16b, v0.16b, v1.16b  // + low bits

          // Load G(x) and floor(x^48 / G(x)).
          ld1             {fold_consts.2d}, [fold_consts_ptr]
          __pmull_pre_\p  fold_consts

          // Use Barrett reduction to compute the final CRC value.
          __pmull_\p      v1, v0, fold_consts, 2  // high 32 bits * floor(x^48 / G(x))
          ushr            v1.2d, v1.2d, #32       // /= x^32
          __pmull_\p      v1, v1, fold_consts     // *= G(x)
          ushr            v0.2d, v0.2d, #48
          eor             v0.16b, v0.16b, v1.16b  // + low 16 nonzero bits
          // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

          umov            w0, v0.h[0]
          // Only the p8 entry point pushes a frame, so only it pops one.
          .ifc            \p, p8
          frame_pop
          .endif
          ret

  .Lless_than_256_bytes_\@:
          // Checksumming a buffer of length 16...255 bytes

          adr_l           fold_consts_ptr, .Lfold_across_16_bytes_consts

          // Load the first 16 data bytes.
          ldr             q7, [buf], #0x10
  CPU_LE( rev64           v7.16b, v7.16b                  )
  CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8      )

          // XOR the first 16 data *bits* with the initial CRC value.
          movi            v0.16b, #0
          mov             v0.h[7], init_crc
          eor             v7.16b, v7.16b, v0.16b

          // Load the fold-across-16-bytes constants. The post-increment
          // leaves fold_consts_ptr at the final-fold constants, as the shared
          // reduction code expects.
          ld1             {fold_consts.2d}, [fold_consts_ptr], #16
          __pmull_pre_\p  fold_consts

          cmp             len, #16
          b.eq            .Lreduce_final_16_bytes_\@      // len == 16
          subs            len, len, #32
          b.ge            .Lfold_16_bytes_loop_\@         // 32 <= len <= 255
          add             len, len, #16
          b               .Lhandle_partial_segment_\@     // 17 <= len <= 31
  .endm
  //
  // u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
  //
  // Assumes len >= 16.
  //
  // Variant that relies on 8-bit PMULL only. Unlike the p64 entry point it
  // pushes a frame first: the p8 multiply macro calls its core routine with
  // bl, which overwrites the link register. The matching frame_pop and the
  // ret are emitted at the end of crc_t10dif_pmull.
  //
  SYM_FUNC_START(crc_t10dif_pmull_p8)
          frame_push      1
          crc_t10dif_pmull p8
  SYM_FUNC_END(crc_t10dif_pmull_p8)
  // Align the p64 entry point to a 2^5 = 32-byte boundary.
  .align          5
  //
  // u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
  //
  // Assumes len >= 16.
  //
  // Variant that uses the 64-bit PMULL/PMULL2 forms directly. No frame is
  // pushed; the ret comes from the end of crc_t10dif_pmull.
  //
  SYM_FUNC_START(crc_t10dif_pmull_p64)
          crc_t10dif_pmull p64
  SYM_FUNC_END(crc_t10dif_pmull_p64)
  // Read-only data, aligned to 2^4 = 16 bytes.
  .section        ".rodata", "a"
  .align          4

  // Fold constants precomputed from the polynomial 0x18bb7
  // G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
  //
  // The code walks this table sequentially through fold_consts_ptr, 16 bytes
  // at a time, so the order of the entries below must not be changed.
  .Lfold_across_128_bytes_consts:
          .quad           0x0000000000006123      // x^(8*128) mod G(x)
          .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
  // .Lfold_across_64_bytes_consts:
          .quad           0x0000000000001069      // x^(4*128) mod G(x)
          .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
  // .Lfold_across_32_bytes_consts:
          .quad           0x000000000000857d      // x^(2*128) mod G(x)
          .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
  .Lfold_across_16_bytes_consts:
          .quad           0x000000000000a010      // x^(1*128) mod G(x)
          .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
  // .Lfinal_fold_consts:
          .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
          .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
  // .Lbarrett_reduction_consts:
          .quad           0x0000000000018bb7      // G(x)
          .quad           0x00000001f65a57f8      // floor(x^48 / G(x))

  // For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
  // len] is the index vector to shift left by 'len' bytes, and is also {0x80,
  // ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
  //
  // With len restricted to that range, only table bytes 1..30 are ever read;
  // the first and last bytes act as padding.
  .Lbyteshift_table:
          .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
          .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
          .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
          .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0