/* arch/x86/lib/memmove_64.S */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file is re-written from memmove_64.c file.
 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

/* Drop any macro so the symbol names below refer to the real function. */
#undef memmove

/* .noinstr.text: code that must not be instrumented (tracing/kprobes). */
.section .noinstr.text, "ax"

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
SYM_FUNC_START(__memmove)

	/* memmove() returns dest; stash it in %rax up front. */
	mov %rdi, %rax

	/*
	 * Decide forward/backward copy mode.
	 * src >= dest: a forward copy is always safe, even on overlap.
	 * src < dest: forward copy is only safe when the regions do not
	 * overlap (src + count <= dest); otherwise go backward at 2:.
	 */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8			/* %r8 = src + count (end of source) */
	cmp %rdi, %r8
	jg 2f				/* source end reaches into dest: copy backward */

#define CHECK_LEN	cmp $0x20, %rdx; jb 1f
#define MEMMOVE_BYTES	movq %rdx, %rcx; rep movsb; RET
.Lmemmove_begin_forward:
	/*
	 * Boot-time patched via alternatives:
	 *  - baseline:          only the length check (counts < 0x20 go to
	 *                       the small-copy tail at 1:), then fall
	 *                       through to the movsq/register paths below;
	 *  - X86_FEATURE_ERMS:  counts >= 0x20 are done with rep movsb and
	 *                       return immediately; small counts still go
	 *                       to 1:;
	 *  - X86_FEATURE_FSRM:  every count is done with rep movsb.
	 */
	ALTERNATIVE_2 __stringify(CHECK_LEN), \
		__stringify(CHECK_LEN; MEMMOVE_BYTES), X86_FEATURE_ERMS, \
		__stringify(MEMMOVE_BYTES), X86_FEATURE_FSRM

	/*
	 * movsq has high startup latency, so mid-sized copies
	 * (< 680 bytes) are done with general-purpose registers instead.
	 */
	cmp $680, %rdx
	jb 3f
	/*
	 * movsq is only good for the mutually-aligned case; comparing the
	 * low bytes of the two pointers is the alignment proxy used here.
	 */
	cmpb %dil, %sil
	je 4f
3:
	/* Pre-bias the count so the in-loop sub/jae pair below works. */
	sub $0x20, %rdx
	/*
	 * Forward register loop: gobble 32 bytes per iteration through
	 * four qword registers.  All loads are issued before the stores,
	 * and lea/mov do not touch flags, so the jae still sees the
	 * borrow flag from the in-loop sub.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b				/* loop while the biased count didn't borrow */
	/* Undo the bias: %rdx = tail length, < 0x20 bytes. */
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle large aligned data forward by movsq.  rep movsq copies
	 * count/8 qwords; the final (possibly partial) qword is loaded
	 * into %r11 before the string op and stored afterwards, which
	 * covers the count % 8 remainder.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11	/* last qword of src, saved early */
	lea -8(%rdi, %rdx), %r10	/* address of last qword of dest */
	shrq $3, %rcx			/* qword count */
	rep movsq
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:
	/*
	 * Handle large aligned data backward by movsq: set the direction
	 * flag, copy top-down from the last qwords, then restore DF with
	 * cld before any other code runs.  The first qword of src is
	 * saved in %r11 before it can be clobbered and stored last,
	 * covering the count % 8 remainder at the head.
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11		/* first qword of src, saved early */
	movq %rdi, %r10			/* head of dest, for the final store */
	leaq -8(%rsi, %rdx), %rsi	/* point at last qword of src */
	leaq -8(%rdi, %rdx), %rdi	/* point at last qword of dest */
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f
	/*
	 * Start to prepare for backward copy.
	 */
	.p2align 4
2:
	cmp $0x20, %rdx
	jb 1f				/* tiny copy: shared tail handles it */
	cmp $680, %rdx
	jb 6f				/* mid-sized: register loop below */
	cmp %dil, %sil			/* same alignment proxy as forward path */
	je 7b
6:
	/*
	 * Calculate copy position to tail: point %rsi/%rdi one past the
	 * end so the loop can walk backward.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx		/* pre-bias, mirroring 3: above */
	/*
	 * Backward register loop: gobble 32 bytes per iteration, the
	 * mirror image of loop 5: above.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi
	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Calculate copy position to head: undo the bias and rewind the
	 * pointers so the shared tail code at 1: sees base addresses.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
1:
	/*
	 * Shared tail: 0 <= count < 0x20 remains.  Each size class below
	 * loads both the head and the (possibly overlapping) tail of the
	 * remainder before storing either, so overlap is safe.
	 */
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move data from 2 bytes to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move data for 1 byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	RET
SYM_FUNC_END(__memmove)
/* Export both the implementation and the canonical memmove alias. */
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_MEMFUNC(memmove, __memmove)
EXPORT_SYMBOL(memmove)