/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */
  9. #include <linux/linkage.h>
  10. #include <linux/cfi_types.h>
  11. #define PASS0_SUMS %xmm0
  12. #define PASS1_SUMS %xmm1
  13. #define PASS2_SUMS %xmm2
  14. #define PASS3_SUMS %xmm3
  15. #define K0 %xmm4
  16. #define K1 %xmm5
  17. #define K2 %xmm6
  18. #define K3 %xmm7
  19. #define T0 %xmm8
  20. #define T1 %xmm9
  21. #define T2 %xmm10
  22. #define T3 %xmm11
  23. #define T4 %xmm12
  24. #define T5 %xmm13
  25. #define T6 %xmm14
  26. #define T7 %xmm15
  27. #define KEY %rdi
  28. #define MESSAGE %rsi
  29. #define MESSAGE_LEN %rdx
  30. #define HASH %rcx
  31. .macro _nh_stride k0, k1, k2, k3, offset
  32. // Load next message stride
  33. movdqu \offset(MESSAGE), T1
  34. // Load next key stride
  35. movdqu \offset(KEY), \k3
  36. // Add message words to key words
  37. movdqa T1, T2
  38. movdqa T1, T3
  39. paddd T1, \k0 // reuse k0 to avoid a move
  40. paddd \k1, T1
  41. paddd \k2, T2
  42. paddd \k3, T3
  43. // Multiply 32x32 => 64 and accumulate
  44. pshufd $0x10, \k0, T4
  45. pshufd $0x32, \k0, \k0
  46. pshufd $0x10, T1, T5
  47. pshufd $0x32, T1, T1
  48. pshufd $0x10, T2, T6
  49. pshufd $0x32, T2, T2
  50. pshufd $0x10, T3, T7
  51. pshufd $0x32, T3, T3
  52. pmuludq T4, \k0
  53. pmuludq T5, T1
  54. pmuludq T6, T2
  55. pmuludq T7, T3
  56. paddq \k0, PASS0_SUMS
  57. paddq T1, PASS1_SUMS
  58. paddq T2, PASS2_SUMS
  59. paddq T3, PASS3_SUMS
  60. .endm
  61. /*
  62. * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
  63. * __le64 hash[NH_NUM_PASSES])
  64. *
  65. * It's guaranteed that message_len % 16 == 0.
  66. */
  67. SYM_TYPED_FUNC_START(nh_sse2)
  68. movdqu 0x00(KEY), K0
  69. movdqu 0x10(KEY), K1
  70. movdqu 0x20(KEY), K2
  71. add $0x30, KEY
  72. pxor PASS0_SUMS, PASS0_SUMS
  73. pxor PASS1_SUMS, PASS1_SUMS
  74. pxor PASS2_SUMS, PASS2_SUMS
  75. pxor PASS3_SUMS, PASS3_SUMS
  76. sub $0x40, MESSAGE_LEN
  77. jl .Lloop4_done
  78. .Lloop4:
  79. _nh_stride K0, K1, K2, K3, 0x00
  80. _nh_stride K1, K2, K3, K0, 0x10
  81. _nh_stride K2, K3, K0, K1, 0x20
  82. _nh_stride K3, K0, K1, K2, 0x30
  83. add $0x40, KEY
  84. add $0x40, MESSAGE
  85. sub $0x40, MESSAGE_LEN
  86. jge .Lloop4
  87. .Lloop4_done:
  88. and $0x3f, MESSAGE_LEN
  89. jz .Ldone
  90. _nh_stride K0, K1, K2, K3, 0x00
  91. sub $0x10, MESSAGE_LEN
  92. jz .Ldone
  93. _nh_stride K1, K2, K3, K0, 0x10
  94. sub $0x10, MESSAGE_LEN
  95. jz .Ldone
  96. _nh_stride K2, K3, K0, K1, 0x20
  97. .Ldone:
  98. // Sum the accumulators for each pass, then store the sums to 'hash'
  99. movdqa PASS0_SUMS, T0
  100. movdqa PASS2_SUMS, T1
  101. punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A)
  102. punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A)
  103. punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B)
  104. punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B)
  105. paddq PASS0_SUMS, T0
  106. paddq PASS2_SUMS, T1
  107. movdqu T0, 0x00(HASH)
  108. movdqu T1, 0x10(HASH)
  109. RET
  110. SYM_FUNC_END(nh_sse2)