recov_neon_inner.c 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. /*
  2. * Copyright (C) 2012 Intel Corporation
  3. * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU General Public License
  7. * as published by the Free Software Foundation; version 2
  8. * of the License.
  9. */
  10. #include <arm_neon.h>
  11. #ifdef CONFIG_ARM
  12. /*
  13. * AArch32 does not provide this intrinsic natively because it does not
  14. * implement the underlying instruction. AArch32 only provides a 64-bit
  15. * wide vtbl.8 instruction, so use that instead.
  16. */
  /*
   * AArch32 stand-in for the 128-bit table lookup: a is the 16-byte table,
   * b holds 16 byte indices. The lookup is done as two 64-bit wide
   * vtbl2_u8() operations, one for each half of b, and the halves are
   * glued back together.
   */
  static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
  {
      /* Reinterpret the 128-bit table as the register pair vtbl2_u8() takes. */
      union {
          uint8x16_t val;
          uint8x8x2_t pair;
      } table = { .val = a };
      uint8x8_t lower = vtbl2_u8(table.pair, vget_low_u8(b));
      uint8x8_t upper = vtbl2_u8(table.pair, vget_high_u8(b));

      return vcombine_u8(lower, upper);
  }
  26. #endif
  /*
   * NEON inner loop of the RAID-6 two-data-block recovery.
   *
   * Scalar equivalent, applied to each of the 'bytes' positions:
   *
   *   px    = *p ^ *dp;
   *   qx    = lookup(qmul, *q ^ *dq);
   *   *dq++ = db = lookup(pbmul, px) ^ qx;
   *   *dp++ = db ^ px;
   *   p++; q++;
   *
   * with lookup(t, x) = t[x & 0x0f] ^ t[16 + (x >> 4)]: pbmul and qmul are
   * each read as two consecutive 16-entry tables, the first indexed by the
   * low nibble of the byte and the second by its high nibble.
   *
   * @bytes: number of bytes to process; values <= 0 do nothing
   * @p, @q: input buffers, only read
   * @dp, @dq: buffers that are read and then overwritten in place
   * @pbmul, @qmul: 32-byte lookup tables as described above
   */
  void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
                                uint8_t *dq, const uint8_t *pbmul,
                                const uint8_t *qmul)
  {
      uint8x16_t pm0 = vld1q_u8(pbmul);
      uint8x16_t pm1 = vld1q_u8(pbmul + 16);
      uint8x16_t qm0 = vld1q_u8(qmul);
      uint8x16_t qm1 = vld1q_u8(qmul + 16);
      uint8x16_t x0f = vdupq_n_u8(0x0f);

      /*
       * Vector path, 16 bytes per iteration. The condition is '>= 16'
       * rather than '!= 0' so that a count which is not a multiple of 16
       * cannot step past zero and run off the end of the buffers.
       */
      while (bytes >= 16) {
          uint8x16_t vx, vy, px, qx, db;

          px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
          vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

          /* A per-byte unsigned shift leaves the high nibble in 0..15. */
          vy = vshrq_n_u8(vx, 4);
          vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
          vy = vqtbl1q_u8(qm1, vy);
          qx = veorq_u8(vx, vy);

          vy = vshrq_n_u8(px, 4);
          vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
          vy = vqtbl1q_u8(pm1, vy);
          vx = veorq_u8(vx, vy);
          db = veorq_u8(vx, qx);

          vst1q_u8(dq, db);
          vst1q_u8(dp, veorq_u8(db, px));

          bytes -= 16;
          p += 16;
          q += 16;
          dp += 16;
          dq += 16;
      }

      /* Scalar tail: same computation for the last (bytes % 16) bytes. */
      while (bytes > 0) {
          uint8_t px = (uint8_t)(*p ^ *dp);
          uint8_t qx = (uint8_t)(*q ^ *dq);
          uint8_t db;

          qx = (uint8_t)(qmul[qx & 0x0f] ^ qmul[16 + (qx >> 4)]);
          db = (uint8_t)(pbmul[px & 0x0f] ^ pbmul[16 + (px >> 4)] ^ qx);

          *dq++ = db;
          *dp++ = (uint8_t)(db ^ px);
          p++;
          q++;
          bytes--;
      }
  }
  /*
   * NEON inner loop of the RAID-6 data + P recovery.
   *
   * Scalar equivalent, applied to each of the 'bytes' positions:
   *
   *   *p++ ^= *dq = lookup(qmul, *q ^ *dq);
   *   q++; dq++;
   *
   * with lookup(t, x) = t[x & 0x0f] ^ t[16 + (x >> 4)]: qmul is read as two
   * consecutive 16-entry tables, the first indexed by the low nibble of the
   * byte and the second by its high nibble.
   *
   * @bytes: number of bytes to process; values <= 0 do nothing
   * @p: buffer that is XOR-updated in place with the value stored to dq
   * @q: input buffer, only read
   * @dq: buffer that is read and then overwritten in place
   * @qmul: 32-byte lookup table as described above
   */
  void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
                                const uint8_t *qmul)
  {
      uint8x16_t qm0 = vld1q_u8(qmul);
      uint8x16_t qm1 = vld1q_u8(qmul + 16);
      uint8x16_t x0f = vdupq_n_u8(0x0f);

      /*
       * Vector path, 16 bytes per iteration. The condition is '>= 16'
       * rather than '!= 0' so that a count which is not a multiple of 16
       * cannot step past zero and run off the end of the buffers.
       */
      while (bytes >= 16) {
          uint8x16_t vx, vy;

          vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

          /* A per-byte unsigned shift leaves the high nibble in 0..15. */
          vy = vshrq_n_u8(vx, 4);
          vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
          vy = vqtbl1q_u8(qm1, vy);
          vx = veorq_u8(vx, vy);
          vy = veorq_u8(vx, vld1q_u8(p));

          vst1q_u8(dq, vx);
          vst1q_u8(p, vy);

          bytes -= 16;
          p += 16;
          q += 16;
          dq += 16;
      }

      /* Scalar tail: same computation for the last (bytes % 16) bytes. */
      while (bytes > 0) {
          uint8_t x = (uint8_t)(*q ^ *dq);
          uint8_t np;

          x = (uint8_t)(qmul[x & 0x0f] ^ qmul[16 + (x >> 4)]);
          /* Read p before writing dq, mirroring the vector path's order. */
          np = (uint8_t)(*p ^ x);

          *dq++ = x;
          *p++ = np;
          q++;
          bytes--;
      }
  }