/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */
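
	/*
	 * For reference, the C-level contract this routine implements:
	 *
	 *	void *memmove(void *dest, const void *src, size_t n);
	 *
	 * i.e. copy n bytes from src to dest, tolerating overlapping
	 * regions, and return dest.
	 */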

	/* Return if nothing to do */
	beq a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 *    Forward Copy: a1 - Index counter of src
	 *    Reverse Copy: a4 - Index counter of src
	 *    Forward Copy: t3 - Index counter of dest
	 *    Reverse Copy: t4 - Index counter of dest
	 * Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 * Both Copy Modes: t6 - Non-inclusive last multibyte/aligned of dest
	 * Both Copy Modes: t0 - Link / Temporary for load-store
	 * Both Copy Modes: t1 - Temporary for load-store
	 * Both Copy Modes: t2 - Temporary for load-store
	 * Both Copy Modes: a5 - dest to src alignment offset
	 * Both Copy Modes: a6 - Shift amount
	 * Both Copy Modes: a7 - Inverse shift amount
	 * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv t3, a0
	add t4, a0, a2
	add a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy
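
	/*
	 * Illustrative example (assuming RV64, where SZREG == 8):
	 * andi with -(2 * SZREG) == -16 clears the low four bits of n,
	 * so t0 == 0, and the branch to the byte copy is taken, exactly
	 * when n < 16.
	 */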

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3), rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low bound of SZREG alignment in the dest memory
	 * region. Note that this could overshoot the dest memory
	 * region if n is less than SZREG. This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq t5, t3, 1f
	addi t5, t5, SZREG
	1:
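
	/*
	 * Illustrative example (assuming RV64, SZREG == 8): for dest ==
	 * 0x1003, t5 starts as 0x1000, which differs from dest, so it is
	 * bumped to 0x1008, the first 8-byte-aligned address inside the
	 * dest region. For dest == 0x1000, t5 is already correct.
	 */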

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
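	/*
	 * Illustrative example (assuming RV64, SZREG == 8): dest == 0x2004
	 * and src == 0x300c share the offset 4 within an 8-byte word, so
	 * their XOR has no low bits set and we take the co-aligned copy.
	 * With src == 0x300a the offsets differ, so we fall through to the
	 * misaligned fixup copy below.
	 */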
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)
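
	/*
	 * Illustrative example (assuming RV64, so SZREG == 8 and XLEN == 64):
	 * if src is offset by 3 bytes, a5 == 3 and a6 == 24. Then
	 * a7 == ~24 + 65 == -25 + 65 == 40 == 64 - 24, i.e. XLEN - a6.
	 */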

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 *	load_val1 = load_ptr[1];
	 *	store_ptr += 2;
	 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val0 = load_ptr[2];
	 *	load_ptr += 2;
	 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */
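
	/*
	 * Illustrative example of one merged store (assuming RV64 and a
	 * 3-byte src offset, so a6 == 24 and a7 == 40): on little-endian,
	 * load_val0 >> 24 keeps the upper 5 bytes of the first aligned src
	 * word in the low positions, and load_val1 << 40 supplies the next
	 * 3 bytes in the high positions, so each store writes 8 consecutive
	 * source bytes starting at the original, unaligned src address.
	 */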

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi t3, t3, (2 * SZREG)
	srl t0, t0, a6
	sll t2, t1, a7
	or t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
	beq t3, a2, 2f
	REG_L t0, (2 * SZREG)(a1)
	addi a1, a1, (2 * SZREG)
	srl t1, t1, a6
	sll t2, t0, a7
	or t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
	bne t3, t6, 1b
	2:
	mv t3, t6 /* Fix the dest pointer in case the loop was broken */

	add a1, t3, a5 /* Restore the src pointer */
	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 *	load_val0 = load_ptr[-1];
	 *	store_ptr -= 2;
	 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val1 = load_ptr[-2];
	 *	load_ptr -= 2;
	 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi t4, t4, (-2 * SZREG)
	sll t1, t1, a7
	srl t2, t0, a6
	or t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)
	beq t4, a2, 2f
	REG_L t1, (-2 * SZREG)(a4)
	addi a4, a4, (-2 * SZREG)
	sll t0, t0, a7
	srl t2, t1, a6
	or t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)
	bne t4, t5, 1b
	2:
	mv t4, t5 /* Fix the dest pointer in case the loop was broken */

	add a4, t4, a5 /* Restore the src pointer */
	j .Lbyte_copy_reverse /* Copy any remaining bytes */

	/*
	 * Simple copy loops for SZREG co-aligned memory locations.
	 * These also make calls to do byte copies for any unaligned
	 * data at their terminations.
	 */
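
	/*
	 * C-style sketch of the forward word-copy loop below, illustrative
	 * only; one SZREG-sized word per iteration, and the reverse loop
	 * mirrors it with decrementing pointers:
	 *
	 * do {
	 *	*store_ptr++ = *load_ptr++;
	 * } while (store_ptr != store_ptr_end);
	 */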
.Lcoaligned_copy:
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi a1, a1, SZREG
	addi t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi a4, a4, -SZREG
	addi t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

	/*
	 * These are basically sub-functions within the function. They
	 * are used to byte copy until the dest pointer is in alignment,
	 * at which point a bulk copy method can be used by the calling
	 * code. These work on the same registers as the bulk copy loops,
	 * so the register values can be picked up from where they were
	 * left, and we avoid code duplication without any overhead except
	 * the call-in and return jumps.
	 */
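
	/*
	 * Call/return linkage for these helpers (illustrative; these are
	 * the instructions actually used below):
	 *
	 *	jal t0, .Lbyte_copy_until_aligned_forward
	 *	...
	 *	jalr zero, 0x0(t0)
	 *
	 * Linking through t0 instead of ra keeps ra untouched, so memmove
	 * can still return to its own caller with a plain ret.
	 */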
.Lbyte_copy_until_aligned_forward:
	beq t3, t5, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq t4, t6, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

	/*
	 * Simple byte copy loops.
	 * These will byte copy until they reach the end of data to copy.
	 * At that point, they return from memmove.
	 */
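
	/*
	 * C-style sketch of the forward loop below, illustrative only;
	 * one byte per iteration, used when the copy is too small or too
	 * misaligned for the word loops. The reverse loop mirrors it with
	 * decrementing pointers:
	 *
	 * while (store_ptr != store_ptr_end)
	 *	*store_ptr++ = *load_ptr++;
	 */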
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq t3, t4, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t4, 1b
	2:
	ret

.Lbyte_copy_reverse:
	beq t4, t3, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t3, 1b
	2:

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)