memset.S

/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */
dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9
ENTRY(__memset)
WEAK(memset)
	mov	dst, dstin	/* Preserve return value. */
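	/* Replicate the low byte of c into all eight bytes of A_l. */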
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
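	/* count <= 15: each tested bit of count selects one store of 8, 4, 2 or 1 bytes. */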
	/* All stores may be unaligned. */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret
.Lover16_proc:
	/* Check whether the start address is 16-byte aligned. */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
	/*
	 * Count is at least 16, so we can use an stp to store the first
	 * 16 bytes, then advance dst to the next 16-byte boundary.
	 */
	stp	A_l, A_l, [dst]		/* Unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
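	/* If the fill value is zero, try the DC ZVA block-zeroing path. */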
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
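	/*
	 * At most 63 bytes remain.  Bits 5:4 of count select how many of the
	 * three stp instructions below execute by falling through.
	 */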
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
	/*
	 * Fewer than 16 bytes are left, so use an stp to write the last
	 * 16 bytes.  Some bytes are written twice and the access may be
	 * unaligned.
	 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret
	/*
	 * Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16	/* Pre-bias. */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret
	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short	/* count is at least 128 bytes */
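	/*
	 * dczid_el0: bit 4 (DZP) set means DC ZVA is prohibited; bits 3:0
	 * give log2 of the block size in words, so the block size in bytes
	 * is 4 << dczid_el0[3:0].
	 */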
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len
	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is at least 64; it is not worth using ZVA if the
	 * block size is smaller than 64 bytes.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f			/* Already aligned. */
	/* Not aligned: check that there's enough to zero after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length remaining after alignment is at least
	 * 64 bytes and at least one ZVA block, so the zeroing below cannot
	 * run past the end of the buffer.
	 */
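	/*
	 * If tmp1 < 64 the ccmp sets NZCV to 0b1000 (N set) so the b.lt is
	 * taken; otherwise it compares tmp1 against the ZVA block size.
	 */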
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
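	/*
	 * Zero 64 bytes per iteration until dst reaches ZVA alignment;
	 * tmp2 goes negative on the final iteration, which may overrun
	 * the alignment boundary.
	 */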
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x
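	/*
	 * count is pre-biased by one block so the b.ge below zeroes the
	 * right number of whole blocks; the final ands recovers the
	 * residual count modulo the block size.
	 */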
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
ENDPIPROC(memset)
ENDPROC(__memset)