/* arch/x86/lib/memcpy_64.S */
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"
  10. /*
  11. * memcpy - Copy a memory block.
  12. *
  13. * Input:
  14. * rdi destination
  15. * rsi source
  16. * rdx count
  17. *
  18. * Output:
  19. * rax original destination
  20. *
  21. * The FSRM alternative should be done inline (avoiding the call and
  22. * the disgusting return handling), but that would require some help
  23. * from the compiler for better calling conventions.
  24. *
  25. * The 'rep movsb' itself is small enough to replace the call, but the
  26. * two register moves blow up the code. And one of them is "needed"
  27. * only for the return value that is the same as the source input,
  28. * which the compiler could/should do much better anyway.
  29. */
  30. SYM_TYPED_FUNC_START(__memcpy)
  31. ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
  32. movq %rdi, %rax
  33. movq %rdx, %rcx
  34. rep movsb
  35. RET
  36. SYM_FUNC_END(__memcpy)
  37. EXPORT_SYMBOL(__memcpy)
  38. SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
  39. EXPORT_SYMBOL(memcpy)
  40. SYM_FUNC_START_LOCAL(memcpy_orig)
  41. movq %rdi, %rax
  42. cmpq $0x20, %rdx
  43. jb .Lhandle_tail
  44. /*
  45. * We check whether memory false dependence could occur,
  46. * then jump to corresponding copy mode.
  47. */
  48. cmp %dil, %sil
  49. jl .Lcopy_backward
  50. subq $0x20, %rdx
  51. .Lcopy_forward_loop:
  52. subq $0x20, %rdx
  53. /*
  54. * Move in blocks of 4x8 bytes:
  55. */
  56. movq 0*8(%rsi), %r8
  57. movq 1*8(%rsi), %r9
  58. movq 2*8(%rsi), %r10
  59. movq 3*8(%rsi), %r11
  60. leaq 4*8(%rsi), %rsi
  61. movq %r8, 0*8(%rdi)
  62. movq %r9, 1*8(%rdi)
  63. movq %r10, 2*8(%rdi)
  64. movq %r11, 3*8(%rdi)
  65. leaq 4*8(%rdi), %rdi
  66. jae .Lcopy_forward_loop
  67. addl $0x20, %edx
  68. jmp .Lhandle_tail
  69. .Lcopy_backward:
  70. /*
  71. * Calculate copy position to tail.
  72. */
  73. addq %rdx, %rsi
  74. addq %rdx, %rdi
  75. subq $0x20, %rdx
  76. /*
  77. * At most 3 ALU operations in one cycle,
  78. * so append NOPS in the same 16 bytes trunk.
  79. */
  80. .p2align 4
  81. .Lcopy_backward_loop:
  82. subq $0x20, %rdx
  83. movq -1*8(%rsi), %r8
  84. movq -2*8(%rsi), %r9
  85. movq -3*8(%rsi), %r10
  86. movq -4*8(%rsi), %r11
  87. leaq -4*8(%rsi), %rsi
  88. movq %r8, -1*8(%rdi)
  89. movq %r9, -2*8(%rdi)
  90. movq %r10, -3*8(%rdi)
  91. movq %r11, -4*8(%rdi)
  92. leaq -4*8(%rdi), %rdi
  93. jae .Lcopy_backward_loop
  94. /*
  95. * Calculate copy position to head.
  96. */
  97. addl $0x20, %edx
  98. subq %rdx, %rsi
  99. subq %rdx, %rdi
  100. .Lhandle_tail:
  101. cmpl $16, %edx
  102. jb .Lless_16bytes
  103. /*
  104. * Move data from 16 bytes to 31 bytes.
  105. */
  106. movq 0*8(%rsi), %r8
  107. movq 1*8(%rsi), %r9
  108. movq -2*8(%rsi, %rdx), %r10
  109. movq -1*8(%rsi, %rdx), %r11
  110. movq %r8, 0*8(%rdi)
  111. movq %r9, 1*8(%rdi)
  112. movq %r10, -2*8(%rdi, %rdx)
  113. movq %r11, -1*8(%rdi, %rdx)
  114. RET
  115. .p2align 4
  116. .Lless_16bytes:
  117. cmpl $8, %edx
  118. jb .Lless_8bytes
  119. /*
  120. * Move data from 8 bytes to 15 bytes.
  121. */
  122. movq 0*8(%rsi), %r8
  123. movq -1*8(%rsi, %rdx), %r9
  124. movq %r8, 0*8(%rdi)
  125. movq %r9, -1*8(%rdi, %rdx)
  126. RET
  127. .p2align 4
  128. .Lless_8bytes:
  129. cmpl $4, %edx
  130. jb .Lless_3bytes
  131. /*
  132. * Move data from 4 bytes to 7 bytes.
  133. */
  134. movl (%rsi), %ecx
  135. movl -4(%rsi, %rdx), %r8d
  136. movl %ecx, (%rdi)
  137. movl %r8d, -4(%rdi, %rdx)
  138. RET
  139. .p2align 4
  140. .Lless_3bytes:
  141. subl $1, %edx
  142. jb .Lend
  143. /*
  144. * Move data from 1 bytes to 3 bytes.
  145. */
  146. movzbl (%rsi), %ecx
  147. jz .Lstore_1byte
  148. movzbq 1(%rsi), %r8
  149. movzbq (%rsi, %rdx), %r9
  150. movb %r8b, 1(%rdi)
  151. movb %r9b, (%rdi, %rdx)
  152. .Lstore_1byte:
  153. movb %cl, (%rdi)
  154. .Lend:
  155. RET
  156. SYM_FUNC_END(memcpy_orig)