memcpy.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
 */

#include <linux/export.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/cpu.h>
#include <asm/regdef.h>
#include <asm/unwind_hints.h>

.section .noinstr.text, "ax"

SYM_FUNC_START(memcpy)
	/*
	 * Some CPUs support hardware unaligned access: on those,
	 * ALTERNATIVE patches this into a branch to __memcpy_fast at
	 * boot; otherwise the generic byte-at-a-time copy is used.
	 */
	ALTERNATIVE	"b __memcpy_generic", \
			"b __memcpy_fast", CPU_FEATURE_UAL
SYM_FUNC_END(memcpy)
SYM_FUNC_ALIAS(__memcpy, memcpy)

EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

_ASM_NOKPROBE(memcpy)
_ASM_NOKPROBE(__memcpy)

/*
 * void *__memcpy_generic(void *dst, const void *src, size_t n)
 *
 * Byte-at-a-time fallback for CPUs without hardware unaligned access.
 *
 * a0: dst
 * a1: src
 * a2: n
 */
SYM_FUNC_START(__memcpy_generic)
	move	a3, a0
	beqz	a2, 2f

1:	ld.b	t0, a1, 0
	st.b	t0, a0, 0
	addi.d	a0, a0, 1
	addi.d	a1, a1, 1
	addi.d	a2, a2, -1
	bgt	a2, zero, 1b

2:	move	a0, a3
	jr	ra
SYM_FUNC_END(__memcpy_generic)
_ASM_NOKPROBE(__memcpy_generic)
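
/*
 * void *__memcpy_small(void *dst, const void *src, size_t n)
 *
 * Jump-table dispatch: pcaddi yields the address of case 0 (eight
 * instruction slots, i.e. 32 bytes, past the entry), and every case
 * below is padded to 32 bytes (.align 5), so jr to base + (n << 5)
 * lands on the handler that copies exactly n bytes.
 *
 * a0: dst
 * a1: src
 * a2: n (0 <= n <= 8)
 */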
	.align	5
SYM_FUNC_START_NOALIGN(__memcpy_small)
	pcaddi	t0, 8
	slli.d	a2, a2, 5
	add.d	t0, t0, a2
	jr	t0

	.align	5
0:	jr	ra

	.align	5
1:	ld.b	t0, a1, 0
	st.b	t0, a0, 0
	jr	ra

	.align	5
2:	ld.h	t0, a1, 0
	st.h	t0, a0, 0
	jr	ra

	.align	5
3:	ld.h	t0, a1, 0
	ld.b	t1, a1, 2
	st.h	t0, a0, 0
	st.b	t1, a0, 2
	jr	ra

	.align	5
4:	ld.w	t0, a1, 0
	st.w	t0, a0, 0
	jr	ra

	.align	5
5:	ld.w	t0, a1, 0
	ld.b	t1, a1, 4
	st.w	t0, a0, 0
	st.b	t1, a0, 4
	jr	ra

	.align	5
6:	ld.w	t0, a1, 0
	ld.h	t1, a1, 4
	st.w	t0, a0, 0
	st.h	t1, a0, 4
	jr	ra

	.align	5
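	/* 7 bytes: two 4-byte copies that overlap in the middle (offsets 0 and 3) */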
7:	ld.w	t0, a1, 0
	ld.w	t1, a1, 3
	st.w	t0, a0, 0
	st.w	t1, a0, 3
	jr	ra

	.align	5
8:	ld.d	t0, a1, 0
	st.d	t0, a0, 0
	jr	ra
SYM_FUNC_END(__memcpy_small)
_ASM_NOKPROBE(__memcpy_small)

/*
 * void *__memcpy_fast(void *dst, const void *src, size_t n)
 *
 * Requires hardware unaligned access (CPU_FEATURE_UAL).
 *
 * a0: dst
 * a1: src
 * a2: n
 */
SYM_FUNC_START(__memcpy_fast)
	sltui	t0, a2, 9
	bnez	t0, __memcpy_small
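
	/*
	 * a3/a2 become the source/destination end pointers. The first
	 * and last 8 source bytes are preloaded into a6/a7 and stored
	 * at .Llt8 below, so the unaligned head and tail need no
	 * byte-granular handling.
	 */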
	add.d	a3, a1, a2
	add.d	a2, a0, a2
	ld.d	a6, a1, 0
	ld.d	a7, a3, -8

	/* align up destination address */
	andi	t1, a0, 7
	sub.d	t0, zero, t1
	addi.d	t0, t0, 8
	add.d	a1, a1, t0
	add.d	a5, a0, t0
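
	/* a5 is now 8-byte aligned; the 1..8 skipped head bytes are covered by a6 */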
	addi.d	a4, a3, -64
	bgeu	a1, a4, .Llt64

	/* copy 64 bytes at a time (stores via a5 are aligned; loads may not be) */
.Lloop64:
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	ld.d	t2, a1, 16
	ld.d	t3, a1, 24
	ld.d	t4, a1, 32
	ld.d	t5, a1, 40
	ld.d	t6, a1, 48
	ld.d	t7, a1, 56
	addi.d	a1, a1, 64
	st.d	t0, a5, 0
	st.d	t1, a5, 8
	st.d	t2, a5, 16
	st.d	t3, a5, 24
	st.d	t4, a5, 32
	st.d	t5, a5, 40
	st.d	t6, a5, 48
	st.d	t7, a5, 56
	addi.d	a5, a5, 64
	bltu	a1, a4, .Lloop64

	/* copy the remaining bytes */
.Llt64:
	addi.d	a4, a3, -32
	bgeu	a1, a4, .Llt32
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	ld.d	t2, a1, 16
	ld.d	t3, a1, 24
	addi.d	a1, a1, 32
	st.d	t0, a5, 0
	st.d	t1, a5, 8
	st.d	t2, a5, 16
	st.d	t3, a5, 24
	addi.d	a5, a5, 32

.Llt32:
	addi.d	a4, a3, -16
	bgeu	a1, a4, .Llt16
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	addi.d	a1, a1, 16
	st.d	t0, a5, 0
	st.d	t1, a5, 8
	addi.d	a5, a5, 16

.Llt16:
	addi.d	a4, a3, -8
	bgeu	a1, a4, .Llt8
	ld.d	t0, a1, 0
	st.d	t0, a5, 0

.Llt8:
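	/*
	 * Store the preloaded first and last 8 bytes. These stores may
	 * overlap bytes already copied above; the overlap is harmless
	 * and removes any byte-granular tail handling.
	 */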
	st.d	a6, a0, 0
	st.d	a7, a2, -8

	/* return */
	jr	ra
SYM_FUNC_END(__memcpy_fast)
_ASM_NOKPROBE(__memcpy_fast)
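
/* objtool cannot follow __memcpy_small's computed jump, so exempt it */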
STACK_FRAME_NON_STANDARD __memcpy_small