memcpy_64.S

/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/mcsafe_test.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>
/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs (those that set REP_GOOD). In addition, on
 * CPUs that have the enhanced REP MOVSB/STOSB feature (ERMS), those
 * NOPs are changed into a jmp to memcpy_erms, which does the copy with
 * REP MOVSB.
 */
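/*
 * In other words, after alternatives patching the entry below behaves as
 * one of three variants, selected by CPU feature flags:
 *
 *   no REP_GOOD, no ERMS:  jmp memcpy_orig   (unrolled 8-byte copy loop)
 *   REP_GOOD only:         fall through to the REP MOVSQ + REP MOVSB path
 *   ERMS:                  jmp memcpy_erms   (single REP MOVSB)
 */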
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
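/*
 * Rough C-level sketch of the REP_GOOD fast path below (illustrative
 * only; the actual copy is done by the REP-prefixed string
 * instructions):
 *
 *	void *memcpy(void *dst, const void *src, size_t count)
 *	{
 *		// rep movsq copies count / 8 quadwords ...
 *		// ... then rep movsb copies the remaining count % 8 bytes
 *		return dst;	// rax keeps the original destination
 *	}
 */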
ENTRY(__memcpy)
.weak memcpy
.p2align 4, 0x90
memcpy:
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)
/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)
ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail
	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
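	/*
	 * The check below compares only the low byte of each pointer:
	 * if (u8)src < (u8)dst, the copy is done backward from the end
	 * so that the loads of one iteration do not alias (in their low
	 * address bits) the stores of the previous one. This is a cheap
	 * heuristic, not an exact overlap test.
	 */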
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail
.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes
	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
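	/*
	 * Note: this (and the smaller tail cases below) uses the
	 * "overlapping ends" trick: load the first and the last chunk of
	 * the region, then store both. For any length in range the two
	 * chunks cover the whole tail, possibly overlapping in the
	 * middle, so no further length-dependent branching is needed.
	 */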
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
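	/*
	 * Note: %edx now holds count - 1 and the flags still reflect the
	 * subl above (the movzbl below does not touch them): the jb
	 * already handled count == 0, and the jz below handles
	 * count == 1. For count == 2 or 3, the first, second and last
	 * bytes are stored (the last two may be the same byte).
	 */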
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)
.Lend:
	retq
ENDPROC(memcpy_orig)
#ifndef CONFIG_UML

MCSAFE_TEST_CTL

/*
 * __memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
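/*
 * Illustrative C-level view of the calling convention (a sketch derived
 * from the register usage and the fixup code below, not the
 * authoritative declaration):
 *
 *	unsigned long __memcpy_mcsafe(void *dst, const void *src,
 *				      size_t count);
 *
 * Returns 0 on success, or the number of bytes not copied if a machine
 * check (on reads) or a write fault interrupts the copy.
 */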
ENTRY(__memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
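	/*
	 * The instructions below compute ecx = 8 - (src & 7), i.e. the
	 * number of leading bytes needed to reach 8-byte alignment, and
	 * subtract that from the total count. For example, if the low
	 * bits of %rsi are 3, then ecx = 8 - 3 = 5 leading bytes.
	 */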
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_read_leading_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
	MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_leading_bytes

.L_8byte_aligned:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

.L_read_words:
	movq (%rsi), %r8
	MCSAFE_TEST_SRC %rsi 8 .E_read_words
	MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words:
	movq %r8, (%rdi)
	addq $8, %rsi
	addq $8, %rdi
	decl %ecx
	jnz .L_read_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_read_trailing_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
	MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorl %eax, %eax
.L_done:
	ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
	.section .fixup, "ax"
	/*
	 * Return number of bytes not copied for any failure. Note that
	 * there is no "tail" handling since the source buffer is 8-byte
	 * aligned and poison is cacheline aligned.
	 */
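	/*
	 * On a read fault, %ecx holds the remaining loop count and %edx
	 * the trailing byte count: .E_read_words converts the remaining
	 * words to bytes (shll $3), the trailing bytes are added, and
	 * the total ends up in %eax as the "bytes not copied" return
	 * value.
	 */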
.E_read_words:
	shll $3, %ecx
.E_leading_bytes:
	addl %edx, %ecx
.E_trailing_bytes:
	mov %ecx, %eax
	jmp .L_done

	/*
	 * For write fault handling, given the destination is unaligned,
	 * we handle faults on multi-byte writes with a byte-by-byte
	 * copy up to the write-protected page.
	 */
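	/*
	 * Below, the remaining word count in %ecx is converted to a byte
	 * count and moved into %edx, so the length argument is in place
	 * for the tail call to mcsafe_handle_tail(%rdi, %rsi, %edx),
	 * which retries the rest byte by byte and returns the number of
	 * bytes left uncopied.
	 */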
.E_write_words:
	shll $3, %ecx
	addl %edx, %ecx
	movl %ecx, %edx
	jmp mcsafe_handle_tail

	.previous

	_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
	_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE(.L_write_words, .E_write_words)
	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif