memset_64.S

/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string to get better performance than the original function. The code is
 * simpler and shorter than the original function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
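
/*
 * memset is declared weak and defined as a plain label aliasing __memset
 * below, presumably so that an alternative implementation (for example an
 * instrumented one such as KASAN's) can take over the memset symbol while
 * __memset remains available as the plain version.
 */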
.weak memset

.p2align 4, 0x90
memset:
ENTRY(__memset)
	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB feature. It is
	 * recommended to use it when possible. If it is not available, use
	 * fast string instructions.
	 *
	 * Otherwise, use original memset function.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9
	movq %rdx,%rcx
	andl $7,%edx
	shrq $3,%rcx
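	/*
	 * At this point %rcx holds count/8 (whole qwords) and %edx holds
	 * count%8 (leftover bytes). For example, a 71-byte memset becomes
	 * 8 qword stores via REP STOSQ plus 7 byte stores via REP STOSB.
	 */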
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
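	/*
	 * Multiplying the zero-extended byte by 0x0101010101010101
	 * replicates it into every byte lane of %rax, e.g.
	 * 0xab * 0x0101010101010101 = 0xabababababababab.
	 */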
	rep stosq
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset)
ENDPROC(__memset)
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
ENTRY(memset_erms)
	movq %rdi,%r9
	movb %sil,%al
	movq %rdx,%rcx
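	/*
	 * With ERMS a single REP STOSB over the full byte count is expected
	 * to be fast regardless of size or alignment, which is presumably
	 * why no qword/byte splitting is done here.
	 */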
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset_erms)

ENTRY(memset_orig)
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail
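	/*
	 * %rcx now holds count/64: each iteration of the loop below stores
	 * eight qwords, i.e. 64 bytes. LEAQ is used to advance %rdi
	 * because, unlike ADDQ, it does not modify the flags, so the JNZ
	 * at the bottom still tests the ZF set by DECQ.
	 */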
	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
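	/*
	 * Handle the 0..63 bytes left over from the 64-byte loop: the low
	 * bits of %edx give what remains, "andl $63&(~7)" (i.e. $56)
	 * extracts the whole qwords and "andl $7" below the leftover
	 * bytes. For example, with 100 bytes to store, .Lloop_64 runs once
	 * (64 bytes), then 100 & 56 = 32 bytes are stored as 4 qwords and
	 * 100 & 7 = 4 bytes one at a time.
	 */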
	movl %edx,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx

	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
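	/*
	 * Align %rdi to 8 bytes: store one unaligned qword at the start,
	 * then advance by %r8 = 8 - (dst & 7) bytes and shrink the count
	 * accordingly. The extra bytes that the unaligned store also
	 * covers are simply rewritten by the aligned loops afterwards.
	 */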
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment

.Lfinal:
ENDPROC(memset_orig)