nh-neon-core.S 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  /* SPDX-License-Identifier: GPL-2.0 */
  /*
   * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
   *
   * Copyright 2018 Google LLC
   *
   * Author: Eric Biggers <ebiggers@google.com>
   */
  9. #include <linux/linkage.h>
  10. #include <linux/cfi_types.h>
  /*
   * Register aliases used throughout this file.
   *
   * x0-x3 are the four arguments of nh_neon(), in declaration order.
   * v0-v3 each accumulate two 64-bit partial sums for one NH pass.
   * v4-v7 hold the key strides currently in use; they are reloaded in
   * rotation, one per message stride.
   * v8-v15 are scratch registers.
   *
   * NOTE(review): AAPCS64 treats the low 64 bits of v8-v15 as callee-saved,
   * and nothing in this file saves or restores them.  Presumably every
   * caller runs this code inside a kernel-mode NEON section where clobbering
   * them is acceptable - confirm against the glue code.
   */

          // Function arguments
          KEY             .req    x0
          MESSAGE         .req    x1
          MESSAGE_LEN     .req    x2
          HASH            .req    x3

          // Per-pass accumulators (2 x 64-bit lanes each)
          PASS0_SUMS      .req    v0
          PASS1_SUMS      .req    v1
          PASS2_SUMS      .req    v2
          PASS3_SUMS      .req    v3

          // Key strides (4 x 32-bit lanes each)
          K0              .req    v4
          K1              .req    v5
          K2              .req    v6
          K3              .req    v7

          // Temporaries
          T0              .req    v8
          T1              .req    v9
          T2              .req    v10
          T3              .req    v11
          T4              .req    v12
          T5              .req    v13
          T6              .req    v14
          T7              .req    v15
  /*
   * _nh_stride k0, k1, k2, k3
   *
   * Consume one 16-byte message stride and fold it into all four pass
   * accumulators.
   *
   * k0, k1, k2: vector registers that already hold the key strides used for
   *             passes 0, 1 and 2.
   * k3:         vector register that receives the next 16 bytes read from
   *             KEY; it is then used as the key stride for pass 3.
   *
   * Post-increments MESSAGE and KEY by 16 bytes each.
   * Overwrites T0-T7 and \k3; adds into PASS0_SUMS-PASS3_SUMS.
   */
  .macro _nh_stride k0, k1, k2, k3

          // T3 = next 16 bytes of the message
          ld1             {T3.16b}, [MESSAGE], #16

          // \k3 = next four 32-bit key words
          ld1             {\k3\().4s}, [KEY], #16

          // Lane-wise 32-bit sums of message words and key words, one result
          // per pass.  T3 is both a source and the destination of the final
          // add, so that add has to come last.
          add             T0.4s, T3.4s, \k0\().4s
          add             T1.4s, T3.4s, \k1\().4s
          add             T2.4s, T3.4s, \k2\().4s
          add             T3.4s, T3.4s, \k3\().4s

          // Copy the upper 64 bits of each sum into the lower 64 bits of a
          // second temporary, so lanes 2-3 can be used as a .2s operand.
          mov             T4.d[0], T0.d[1]
          mov             T5.d[0], T1.d[1]
          mov             T6.d[0], T2.d[1]
          mov             T7.d[0], T3.d[1]

          // Unsigned 32x32 => 64 multiply-accumulate: each accumulator gains
          // sum[0] * sum[2] in lane 0 and sum[1] * sum[3] in lane 1.
          umlal           PASS0_SUMS.2d, T0.2s, T4.2s
          umlal           PASS1_SUMS.2d, T1.2s, T5.2s
          umlal           PASS2_SUMS.2d, T2.2s, T6.2s
          umlal           PASS3_SUMS.2d, T3.2s, T7.2s
  .endm
  /*
   * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
   *              __le64 hash[NH_NUM_PASSES])
   *
   * Run the NH computation over 'message' using the key words at 'key' and
   * store four 64-bit sums (32 bytes, one per pass) to 'hash'.
   *
   * It's guaranteed that message_len % 16 == 0, so the message is always a
   * whole number of 16-byte strides.
   *
   * Uses v0-v15 and the flags; x0-x2 are modified as the buffers are walked.
   */
  SYM_TYPED_FUNC_START(nh_neon)

          // Preload the first three key strides and clear the accumulators.
          // The fourth key stride is fetched by the first _nh_stride.
          ld1             {K0.4s,K1.4s}, [KEY], #32
          movi            PASS0_SUMS.2d, #0
          movi            PASS1_SUMS.2d, #0
          ld1             {K2.4s}, [KEY], #16
          movi            PASS2_SUMS.2d, #0
          movi            PASS3_SUMS.2d, #0

          // Fewer than 64 bytes in total: go straight to the tail.
          subs            MESSAGE_LEN, MESSAGE_LEN, #64
          b.lt            .Lnh_tail

          // Main loop: four strides (64 bytes) per iteration.  The key
          // arguments rotate by one register per stride, so after four
          // strides K0-K2 again hold the key strides for the next group.
  .Lnh_loop64:
          _nh_stride      K0, K1, K2, K3
          _nh_stride      K1, K2, K3, K0
          _nh_stride      K2, K3, K0, K1
          _nh_stride      K3, K0, K1, K2
          subs            MESSAGE_LEN, MESSAGE_LEN, #64
          b.ge            .Lnh_loop64

  .Lnh_tail:
          // MESSAGE_LEN is negative here; masking with 63 recovers the
          // 0, 16, 32 or 48 bytes that are still unprocessed.
          ands            MESSAGE_LEN, MESSAGE_LEN, #63
          b.eq            .Lnh_finish
          _nh_stride      K0, K1, K2, K3

          subs            MESSAGE_LEN, MESSAGE_LEN, #16
          b.eq            .Lnh_finish
          _nh_stride      K1, K2, K3, K0

          subs            MESSAGE_LEN, MESSAGE_LEN, #16
          b.eq            .Lnh_finish
          _nh_stride      K2, K3, K0, K1

  .Lnh_finish:
          // Add the two 64-bit lanes of each accumulator together, packing
          // passes 0-1 into T0 and passes 2-3 into T1, then write all four
          // sums to 'hash'.
          addp            T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
          addp            T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
          st1             {T0.16b,T1.16b}, [HASH]
          ret
  SYM_FUNC_END(nh_neon)
  89. SYM_FUNC_END(nh_neon)