// crct10dif-ce-core.S
//
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
//     UINT16 crc_t10dif_pcl(
//         UINT16 init_crc,          // initial CRC value, 16 bits
//         const unsigned char *buf, // buffer pointer to calculate CRC on
//         UINT64 len                // buffer length in bytes (64-bit data)
//     );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//
  69. #include <linux/linkage.h>
  70. #include <asm/assembler.h>
  71. #ifdef CONFIG_CPU_ENDIAN_BE8
  72. #define CPU_LE(code...)
  73. #else
  74. #define CPU_LE(code...) code
  75. #endif
  76. .text
  77. .fpu crypto-neon-fp-armv8
  78. arg1_low32 .req r0
  79. arg2 .req r1
  80. arg3 .req r2
  81. qzr .req q13
  82. q0l .req d0
  83. q0h .req d1
  84. q1l .req d2
  85. q1h .req d3
  86. q2l .req d4
  87. q2h .req d5
  88. q3l .req d6
  89. q3h .req d7
  90. q4l .req d8
  91. q4h .req d9
  92. q5l .req d10
  93. q5h .req d11
  94. q6l .req d12
  95. q6h .req d13
  96. q7l .req d14
  97. q7h .req d15
  98. ENTRY(crc_t10dif_pmull)
  99. vmov.i8 qzr, #0 // init zero register
  100. // adjust the 16-bit initial_crc value, scale it to 32 bits
  101. lsl arg1_low32, arg1_low32, #16
  102. // check if smaller than 256
  103. cmp arg3, #256
  104. // for sizes less than 128, we can't fold 64B at a time...
  105. blt _less_than_128
  106. // load the initial crc value
  107. // crc value does not need to be byte-reflected, but it needs
  108. // to be moved to the high part of the register.
  109. // because data will be byte-reflected and will align with
  110. // initial crc at correct place.
  111. vmov s0, arg1_low32 // initial crc
  112. vext.8 q10, qzr, q0, #4
  113. // receive the initial 64B data, xor the initial crc value
  114. vld1.64 {q0-q1}, [arg2]!
  115. vld1.64 {q2-q3}, [arg2]!
  116. vld1.64 {q4-q5}, [arg2]!
  117. vld1.64 {q6-q7}, [arg2]!
  118. CPU_LE( vrev64.8 q0, q0 )
  119. CPU_LE( vrev64.8 q1, q1 )
  120. CPU_LE( vrev64.8 q2, q2 )
  121. CPU_LE( vrev64.8 q3, q3 )
  122. CPU_LE( vrev64.8 q4, q4 )
  123. CPU_LE( vrev64.8 q5, q5 )
  124. CPU_LE( vrev64.8 q6, q6 )
  125. CPU_LE( vrev64.8 q7, q7 )
  126. vswp d0, d1
  127. vswp d2, d3
  128. vswp d4, d5
  129. vswp d6, d7
  130. vswp d8, d9
  131. vswp d10, d11
  132. vswp d12, d13
  133. vswp d14, d15
  134. // XOR the initial_crc value
  135. veor.8 q0, q0, q10
  136. adr ip, rk3
  137. vld1.64 {q10}, [ip, :128] // xmm10 has rk3 and rk4
  138. //
  139. // we subtract 256 instead of 128 to save one instruction from the loop
  140. //
  141. sub arg3, arg3, #256
  142. // at this section of the code, there is 64*x+y (0<=y<64) bytes of
  143. // buffer. The _fold_64_B_loop will fold 64B at a time
  144. // until we have 64+y Bytes of buffer
  145. // fold 64B at a time. This section of the code folds 4 vector
  146. // registers in parallel
  147. _fold_64_B_loop:
  148. .macro fold64, reg1, reg2
  149. vld1.64 {q11-q12}, [arg2]!
  150. vmull.p64 q8, \reg1\()h, d21
  151. vmull.p64 \reg1, \reg1\()l, d20
  152. vmull.p64 q9, \reg2\()h, d21
  153. vmull.p64 \reg2, \reg2\()l, d20
  154. CPU_LE( vrev64.8 q11, q11 )
  155. CPU_LE( vrev64.8 q12, q12 )
  156. vswp d22, d23
  157. vswp d24, d25
  158. veor.8 \reg1, \reg1, q8
  159. veor.8 \reg2, \reg2, q9
  160. veor.8 \reg1, \reg1, q11
  161. veor.8 \reg2, \reg2, q12
  162. .endm
  163. fold64 q0, q1
  164. fold64 q2, q3
  165. fold64 q4, q5
  166. fold64 q6, q7
  167. subs arg3, arg3, #128
  168. // check if there is another 64B in the buffer to be able to fold
  169. bge _fold_64_B_loop
  170. // at this point, the buffer pointer is pointing at the last y Bytes
  171. // of the buffer the 64B of folded data is in 4 of the vector
  172. // registers: v0, v1, v2, v3
  173. // fold the 8 vector registers to 1 vector register with different
  174. // constants
  175. adr ip, rk9
  176. vld1.64 {q10}, [ip, :128]!
  177. .macro fold16, reg, rk
  178. vmull.p64 q8, \reg\()l, d20
  179. vmull.p64 \reg, \reg\()h, d21
  180. .ifnb \rk
  181. vld1.64 {q10}, [ip, :128]!
  182. .endif
  183. veor.8 q7, q7, q8
  184. veor.8 q7, q7, \reg
  185. .endm
  186. fold16 q0, rk11
  187. fold16 q1, rk13
  188. fold16 q2, rk15
  189. fold16 q3, rk17
  190. fold16 q4, rk19
  191. fold16 q5, rk1
  192. fold16 q6
  193. // instead of 64, we add 48 to the loop counter to save 1 instruction
  194. // from the loop instead of a cmp instruction, we use the negative
  195. // flag with the jl instruction
  196. adds arg3, arg3, #(128-16)
  197. blt _final_reduction_for_128
  198. // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
  199. // and the rest is in memory. We can fold 16 bytes at a time if y>=16
  200. // continue folding 16B at a time
  201. _16B_reduction_loop:
  202. vmull.p64 q8, d14, d20
  203. vmull.p64 q7, d15, d21
  204. veor.8 q7, q7, q8
  205. vld1.64 {q0}, [arg2]!
  206. CPU_LE( vrev64.8 q0, q0 )
  207. vswp d0, d1
  208. veor.8 q7, q7, q0
  209. subs arg3, arg3, #16
  210. // instead of a cmp instruction, we utilize the flags with the
  211. // jge instruction equivalent of: cmp arg3, 16-16
  212. // check if there is any more 16B in the buffer to be able to fold
  213. bge _16B_reduction_loop
  214. // now we have 16+z bytes left to reduce, where 0<= z < 16.
  215. // first, we reduce the data in the xmm7 register
  216. _final_reduction_for_128:
  217. // check if any more data to fold. If not, compute the CRC of
  218. // the final 128 bits
  219. adds arg3, arg3, #16
  220. beq _128_done
  221. // here we are getting data that is less than 16 bytes.
  222. // since we know that there was data before the pointer, we can
  223. // offset the input pointer before the actual point, to receive
  224. // exactly 16 bytes. after that the registers need to be adjusted.
  225. _get_last_two_regs:
  226. add arg2, arg2, arg3
  227. sub arg2, arg2, #16
  228. vld1.64 {q1}, [arg2]
  229. CPU_LE( vrev64.8 q1, q1 )
  230. vswp d2, d3
  231. // get rid of the extra data that was loaded before
  232. // load the shift constant
  233. adr ip, tbl_shf_table + 16
  234. sub ip, ip, arg3
  235. vld1.8 {q0}, [ip]
  236. // shift v2 to the left by arg3 bytes
  237. vtbl.8 d4, {d14-d15}, d0
  238. vtbl.8 d5, {d14-d15}, d1
  239. // shift v7 to the right by 16-arg3 bytes
  240. vmov.i8 q9, #0x80
  241. veor.8 q0, q0, q9
  242. vtbl.8 d18, {d14-d15}, d0
  243. vtbl.8 d19, {d14-d15}, d1
  244. // blend
  245. vshr.s8 q0, q0, #7 // convert to 8-bit mask
  246. vbsl.8 q0, q2, q1
  247. // fold 16 Bytes
  248. vmull.p64 q8, d18, d20
  249. vmull.p64 q7, d19, d21
  250. veor.8 q7, q7, q8
  251. veor.8 q7, q7, q0
  252. _128_done:
  253. // compute crc of a 128-bit value
  254. vldr d20, rk5
  255. vldr d21, rk6 // rk5 and rk6 in xmm10
  256. // 64b fold
  257. vext.8 q0, qzr, q7, #8
  258. vmull.p64 q7, d15, d20
  259. veor.8 q7, q7, q0
  260. // 32b fold
  261. vext.8 q0, q7, qzr, #12
  262. vmov s31, s3
  263. vmull.p64 q0, d0, d21
  264. veor.8 q7, q0, q7
  265. // barrett reduction
  266. _barrett:
  267. vldr d20, rk7
  268. vldr d21, rk8
  269. vmull.p64 q0, d15, d20
  270. vext.8 q0, qzr, q0, #12
  271. vmull.p64 q0, d1, d21
  272. vext.8 q0, qzr, q0, #12
  273. veor.8 q7, q7, q0
  274. vmov r0, s29
  275. _cleanup:
  276. // scale the result back to 16 bits
  277. lsr r0, r0, #16
  278. bx lr
  279. _less_than_128:
  280. teq arg3, #0
  281. beq _cleanup
  282. vmov.i8 q0, #0
  283. vmov s3, arg1_low32 // get the initial crc value
  284. vld1.64 {q7}, [arg2]!
  285. CPU_LE( vrev64.8 q7, q7 )
  286. vswp d14, d15
  287. veor.8 q7, q7, q0
  288. cmp arg3, #16
  289. beq _128_done // exactly 16 left
  290. blt _less_than_16_left
  291. // now if there is, load the constants
  292. vldr d20, rk1
  293. vldr d21, rk2 // rk1 and rk2 in xmm10
  294. // check if there is enough buffer to be able to fold 16B at a time
  295. subs arg3, arg3, #32
  296. addlt arg3, arg3, #16
  297. blt _get_last_two_regs
  298. b _16B_reduction_loop
  299. _less_than_16_left:
  300. // shl r9, 4
  301. adr ip, tbl_shf_table + 16
  302. sub ip, ip, arg3
  303. vld1.8 {q0}, [ip]
  304. vmov.i8 q9, #0x80
  305. veor.8 q0, q0, q9
  306. vtbl.8 d18, {d14-d15}, d0
  307. vtbl.8 d15, {d14-d15}, d1
  308. vmov d14, d18
  309. b _128_done
  310. ENDPROC(crc_t10dif_pmull)
  311. // precomputed constants
  312. // these constants are precomputed from the poly:
  313. // 0x8bb70000 (0x8bb7 scaled to 32 bits)
  314. .align 4
  315. // Q = 0x18BB70000
  316. // rk1 = 2^(32*3) mod Q << 32
  317. // rk2 = 2^(32*5) mod Q << 32
  318. // rk3 = 2^(32*15) mod Q << 32
  319. // rk4 = 2^(32*17) mod Q << 32
  320. // rk5 = 2^(32*3) mod Q << 32
  321. // rk6 = 2^(32*2) mod Q << 32
  322. // rk7 = floor(2^64/Q)
  323. // rk8 = Q
  324. rk3: .quad 0x9d9d000000000000
  325. rk4: .quad 0x7cf5000000000000
  326. rk5: .quad 0x2d56000000000000
  327. rk6: .quad 0x1368000000000000
  328. rk7: .quad 0x00000001f65a57f8
  329. rk8: .quad 0x000000018bb70000
  330. rk9: .quad 0xceae000000000000
  331. rk10: .quad 0xbfd6000000000000
  332. rk11: .quad 0x1e16000000000000
  333. rk12: .quad 0x713c000000000000
  334. rk13: .quad 0xf7f9000000000000
  335. rk14: .quad 0x80a6000000000000
  336. rk15: .quad 0x044c000000000000
  337. rk16: .quad 0xe658000000000000
  338. rk17: .quad 0xad18000000000000
  339. rk18: .quad 0xa497000000000000
  340. rk19: .quad 0x6ee3000000000000
  341. rk20: .quad 0xe7b5000000000000
  342. rk1: .quad 0x2d56000000000000
  343. rk2: .quad 0x06df000000000000
  344. tbl_shf_table:
  345. // use these values for shift constants for the tbl/tbx instruction
  346. // different alignments result in values as shown:
  347. // DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
  348. // DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
  349. // DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
  350. // DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
  351. // DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
  352. // DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
  353. // DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
  354. // DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
  355. // DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
  356. // DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
  357. // DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
  358. // DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
  359. // DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
  360. // DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
  361. // DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
  362. .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
  363. .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
  364. .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
  365. .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0