copyuser_power7.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm VRT,VRB,VRA,VRC
#endif
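/*
 * Note: LVS/VPERM wrap lvsl/lvsr and vperm so the unaligned permute code
 * below is endian-agnostic; on little-endian the shift-right form and the
 * swapped vperm source operands select the same bytes as the big-endian
 * lvsl/vperm pairing.
 */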
.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
.endm

.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
.endm
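/*
 * The errN macros tag the user-memory access that follows them: each drops
 * a local label plus an exception table entry, so a fault on that access
 * branches to the matching .Ldo_errN fixup below.
 */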
#ifdef CONFIG_ALTIVEC
.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
.endm

.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
.endm


.Ldo_err4:
	ld r16,STK_REG(R16)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl CFUNC(exit_vmx_usercopy)
	ld r0,STACKFRAMESIZE+16(r1)
	mtlr r0
	b .Lexit
#endif /* CONFIG_ALTIVEC */
.Ldo_err2:
	ld r22,STK_REG(R22)(r1)
	ld r21,STK_REG(R21)(r1)
	ld r20,STK_REG(R20)(r1)
	ld r19,STK_REG(R19)(r1)
	ld r18,STK_REG(R18)(r1)
	ld r17,STK_REG(R17)(r1)
	ld r16,STK_REG(R16)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r14,STK_REG(R14)(r1)
.Lexit:
	addi r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b __copy_tofrom_user_base
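/*
 * All fixup paths funnel through .Ldo_err1 (after unwinding the stack frame
 * and, for err3/err4, the VMX state): the original dst/src/len arguments are
 * reloaded and the copy is retried by the generic __copy_tofrom_user_base,
 * which returns the number of bytes it could not copy.
 */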
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi r5,16
	cmpldi cr1,r5,3328

	std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
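/*
 * Dispatch: copies under 16 bytes take .Lshort_copy, and copies over 3328
 * bytes take the VMX path when Altivec is available (presumably the point
 * where the enter/exit_vmx_usercopy overhead is amortised). Everything else
 * falls through to the integer-register loop below.
 */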
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg r6,r4
	mtocrf 0x01,r6
	clrldi r6,r6,(64-3)

	bf cr7*4+3,1f
err1;	lbz r0,0(r4)
	addi r4,r4,1
err1;	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
err1;	lhz r0,0(r4)
	addi r4,r4,2
err1;	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
err1;	lwz r0,0(r4)
	addi r4,r4,4
err1;	stw r0,0(r3)
	addi r3,r3,4

3:	sub r5,r5,r6
	cmpldi r5,128
	blt 5f
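	/*
	 * Save LR and the non-volatile GPRs (r14-r22) that the unrolled
	 * 128B-per-iteration loop below is about to use.
	 */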
	mflr r0
	stdu r1,-STACKFRAMESIZE(r1)
	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)
	std r17,STK_REG(R17)(r1)
	std r18,STK_REG(R18)(r1)
	std r19,STK_REG(R19)(r1)
	std r20,STK_REG(R20)(r1)
	std r21,STK_REG(R21)(r1)
	std r22,STK_REG(R22)(r1)
	std r0,STACKFRAMESIZE+16(r1)

	srdi r6,r5,7
	mtctr r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align 5
4:
err2;	ld r0,0(r4)
err2;	ld r6,8(r4)
err2;	ld r7,16(r4)
err2;	ld r8,24(r4)
err2;	ld r9,32(r4)
err2;	ld r10,40(r4)
err2;	ld r11,48(r4)
err2;	ld r12,56(r4)
err2;	ld r14,64(r4)
err2;	ld r15,72(r4)
err2;	ld r16,80(r4)
err2;	ld r17,88(r4)
err2;	ld r18,96(r4)
err2;	ld r19,104(r4)
err2;	ld r20,112(r4)
err2;	ld r21,120(r4)
	addi r4,r4,128
err2;	std r0,0(r3)
err2;	std r6,8(r3)
err2;	std r7,16(r3)
err2;	std r8,24(r3)
err2;	std r9,32(r3)
err2;	std r10,40(r3)
err2;	std r11,48(r3)
err2;	std r12,56(r3)
err2;	std r14,64(r3)
err2;	std r15,72(r3)
err2;	std r16,80(r3)
err2;	std r17,88(r3)
err2;	std r18,96(r3)
err2;	std r19,104(r3)
err2;	std r20,112(r3)
err2;	std r21,120(r3)
	addi r3,r3,128
	bdnz 4b

	clrldi r5,r5,(64-7)

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)
	ld r17,STK_REG(R17)(r1)
	ld r18,STK_REG(R18)(r1)
	ld r19,STK_REG(R19)(r1)
	ld r20,STK_REG(R20)(r1)
	ld r21,STK_REG(R21)(r1)
	ld r22,STK_REG(R22)(r1)
	addi r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi r6,r5,4
	mtocrf 0x01,r6

6:	bf cr7*4+1,7f
err1;	ld r0,0(r4)
err1;	ld r6,8(r4)
err1;	ld r7,16(r4)
err1;	ld r8,24(r4)
err1;	ld r9,32(r4)
err1;	ld r10,40(r4)
err1;	ld r11,48(r4)
err1;	ld r12,56(r4)
	addi r4,r4,64
err1;	std r0,0(r3)
err1;	std r6,8(r3)
err1;	std r7,16(r3)
err1;	std r8,24(r3)
err1;	std r9,32(r3)
err1;	std r10,40(r3)
err1;	std r11,48(r3)
err1;	std r12,56(r3)
	addi r3,r3,64

	/* Up to 63B to go */
7:	bf cr7*4+2,8f
err1;	ld r0,0(r4)
err1;	ld r6,8(r4)
err1;	ld r7,16(r4)
err1;	ld r8,24(r4)
	addi r4,r4,32
err1;	std r0,0(r3)
err1;	std r6,8(r3)
err1;	std r7,16(r3)
err1;	std r8,24(r3)
	addi r3,r3,32

	/* Up to 31B to go */
8:	bf cr7*4+3,9f
err1;	ld r0,0(r4)
err1;	ld r6,8(r4)
	addi r4,r4,16
err1;	std r0,0(r3)
err1;	std r6,8(r3)
	addi r3,r3,16

9:	clrldi r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf 0x01,r5
	bf cr7*4+0,12f
err1;	lwz r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz r6,4(r4)
	addi r4,r4,8
err1;	stw r0,0(r3)
err1;	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
err1;	lwz r0,0(r4)
	addi r4,r4,4
err1;	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
err1;	lhz r0,0(r4)
	addi r4,r4,2
err1;	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
err1;	lbz r0,0(r4)
err1;	stb r0,0(r3)
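	/* All bytes copied: return 0 (nothing left uncopied). */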
15:	li r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi r1,r1,STACKFRAMESIZE
	b .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr r0
	std r0,16(r1)
	stdu r1,-STACKFRAMESIZE(r1)
	bl CFUNC(enter_vmx_usercopy)
	cmpwi cr1,r3,0
	ld r0,STACKFRAMESIZE+16(r1)
	ld r3,STK_REG(R31)(r1)
	ld r4,STK_REG(R30)(r1)
	ld r5,STK_REG(R29)(r1)
	mtlr r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi r6,r4,7
	clrrdi r9,r3,7
	ori r9,r9,1		/* stream=1 */

	srdi r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi r7,0x3FF
	ble 1f
	li r7,0x3FF
1:	lis r0,0x0E00		/* depth=7 */
	sldi r7,r7,7
	or r7,r7,r0

	ori r10,r7,1		/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)
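	/*
	 * r6/r9 hold the cacheline-aligned start of the source and
	 * destination streams, r7/r10 their encoded length and depth;
	 * DCBT_SETUP_STREAMS (from ppc_asm.h) issues the dcbt/dcbtst forms
	 * that describe and start hardware prefetch streams 0 and 1.
	 */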
	beq cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor r6,r4,r3
	rldicl. r6,r6,0,(64-4)
	bne .Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
err3;	lbz r0,0(r4)
	addi r4,r4,1
err3;	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
err3;	lhz r0,0(r4)
	addi r4,r4,2
err3;	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
err3;	lwz r0,0(r4)
	addi r4,r4,4
err3;	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
err3;	ld r0,0(r4)
	addi r4,r4,8
err3;	std r0,0(r3)
	addi r3,r3,8

4:	sub r5,r5,r6
	/* Get the destination 128B aligned */
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)

	li r9,16
	li r10,32
	li r11,48

	bf cr7*4+3,5f
err3;	lvx v1,0,r4
	addi r4,r4,16
err3;	stvx v1,0,r3
	addi r3,r3,16

5:	bf cr7*4+2,6f
err3;	lvx v1,0,r4
err3;	lvx v0,r4,r9
	addi r4,r4,32
err3;	stvx v1,0,r3
err3;	stvx v0,r3,r9
	addi r3,r3,32

6:	bf cr7*4+1,7f
err3;	lvx v3,0,r4
err3;	lvx v2,r4,r9
err3;	lvx v1,r4,r10
err3;	lvx v0,r4,r11
	addi r4,r4,64
err3;	stvx v3,0,r3
err3;	stvx v2,r3,r9
err3;	stvx v1,r3,r10
err3;	stvx v0,r3,r11
	addi r3,r3,64

7:	sub r5,r5,r6
	srdi r6,r5,7

	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)

	li r12,64
	li r14,80
	li r15,96
	li r16,112

	mtctr r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align 5
8:
err4;	lvx v7,0,r4
err4;	lvx v6,r4,r9
err4;	lvx v5,r4,r10
err4;	lvx v4,r4,r11
err4;	lvx v3,r4,r12
err4;	lvx v2,r4,r14
err4;	lvx v1,r4,r15
err4;	lvx v0,r4,r16
	addi r4,r4,128
err4;	stvx v7,0,r3
err4;	stvx v6,r3,r9
err4;	stvx v5,r3,r10
err4;	stvx v4,r3,r11
err4;	stvx v3,r3,r12
err4;	stvx v2,r3,r14
err4;	stvx v1,r3,r15
err4;	stvx v0,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
err3;	lvx v3,0,r4
err3;	lvx v2,r4,r9
err3;	lvx v1,r4,r10
err3;	lvx v0,r4,r11
	addi r4,r4,64
err3;	stvx v3,0,r3
err3;	stvx v2,r3,r9
err3;	stvx v1,r3,r10
err3;	stvx v0,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
err3;	lvx v1,0,r4
err3;	lvx v0,r4,r9
	addi r4,r4,32
err3;	stvx v1,0,r3
err3;	stvx v0,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
err3;	lvx v1,0,r4
	addi r4,r4,16
err3;	stvx v1,0,r3
	addi r3,r3,16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	mtocrf 0x01,r5
	bf cr7*4+0,12f
err3;	ld r0,0(r4)
	addi r4,r4,8
err3;	std r0,0(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
err3;	lwz r0,0(r4)
	addi r4,r4,4
err3;	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
err3;	lhz r0,0(r4)
	addi r4,r4,2
err3;	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
err3;	lbz r0,0(r4)
err3;	stb r0,0(r3)

15:	addi r1,r1,STACKFRAMESIZE
	b CFUNC(exit_vmx_usercopy)	/* tail call optimise */
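	/*
	 * exit_vmx_usercopy() hands the VMX unit back and returns 0, so the
	 * tail call above also supplies this function's success return value
	 * (no bytes left uncopied).
	 */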
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg r6,r3
	mtocrf 0x01,r6
	clrldi r6,r6,(64-4)

	bf cr7*4+3,1f
err3;	lbz r0,0(r4)
	addi r4,r4,1
err3;	stb r0,0(r3)
	addi r3,r3,1

1:	bf cr7*4+2,2f
err3;	lhz r0,0(r4)
	addi r4,r4,2
err3;	sth r0,0(r3)
	addi r3,r3,2

2:	bf cr7*4+1,3f
err3;	lwz r0,0(r4)
	addi r4,r4,4
err3;	stw r0,0(r3)
	addi r3,r3,4

3:	bf cr7*4+0,4f
err3;	lwz r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz r7,4(r4)
	addi r4,r4,8
err3;	stw r0,0(r3)
err3;	stw r7,4(r3)
	addi r3,r3,8

4:	sub r5,r5,r6

	/* Get the destination 128B aligned */
	neg r6,r3
	srdi r7,r6,4
	mtocrf 0x01,r7
	clrldi r6,r6,(64-7)

	li r9,16
	li r10,32
	li r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx v0,0,r4
	addi r4,r4,16
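	/*
	 * v0 is pre-loaded with the first (still misaligned) source vector;
	 * from here on each new load is combined with the previous one via
	 * VPERM so every store is a fully aligned 16B quantity.
	 */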
	bf cr7*4+3,5f
err3;	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi r4,r4,16
err3;	stvx v8,0,r3
	addi r3,r3,16
	vor v0,v1,v1

5:	bf cr7*4+2,6f
err3;	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi r4,r4,32
err3;	stvx v8,0,r3
err3;	stvx v9,r3,r9
	addi r3,r3,32

6:	bf cr7*4+1,7f
err3;	lvx v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi r4,r4,64
err3;	stvx v8,0,r3
err3;	stvx v9,r3,r9
err3;	stvx v10,r3,r10
err3;	stvx v11,r3,r11
	addi r3,r3,64

7:	sub r5,r5,r6
	srdi r6,r5,7

	std r14,STK_REG(R14)(r1)
	std r15,STK_REG(R15)(r1)
	std r16,STK_REG(R16)(r1)

	li r12,64
	li r14,80
	li r15,96
	li r16,112

	mtctr r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align 5
8:
err4;	lvx v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi r4,r4,128
err4;	stvx v8,0,r3
err4;	stvx v9,r3,r9
err4;	stvx v10,r3,r10
err4;	stvx v11,r3,r11
err4;	stvx v12,r3,r12
err4;	stvx v13,r3,r14
err4;	stvx v14,r3,r15
err4;	stvx v15,r3,r16
	addi r3,r3,128
	bdnz 8b

	ld r14,STK_REG(R14)(r1)
	ld r15,STK_REG(R15)(r1)
	ld r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi r5,r5,(64-7)
	srdi r6,r5,4
	mtocrf 0x01,r6

	bf cr7*4+1,9f
err3;	lvx v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi r4,r4,64
err3;	stvx v8,0,r3
err3;	stvx v9,r3,r9
err3;	stvx v10,r3,r10
err3;	stvx v11,r3,r11
	addi r3,r3,64

9:	bf cr7*4+2,10f
err3;	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi r4,r4,32
err3;	stvx v8,0,r3
err3;	stvx v9,r3,r9
	addi r3,r3,32

10:	bf cr7*4+3,11f
err3;	lvx v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi r4,r4,16
err3;	stvx v8,0,r3
	addi r3,r3,16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	addi r4,r4,-16		/* Unwind the +16 load offset */
	mtocrf 0x01,r5
	bf cr7*4+0,12f
err3;	lwz r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz r6,4(r4)
	addi r4,r4,8
err3;	stw r0,0(r3)
err3;	stw r6,4(r3)
	addi r3,r3,8

12:	bf cr7*4+1,13f
err3;	lwz r0,0(r4)
	addi r4,r4,4
err3;	stw r0,0(r3)
	addi r3,r3,4

13:	bf cr7*4+2,14f
err3;	lhz r0,0(r4)
	addi r4,r4,2
err3;	sth r0,0(r3)
	addi r3,r3,2

14:	bf cr7*4+3,15f
err3;	lbz r0,0(r4)
err3;	stb r0,0(r3)

15:	addi r1,r1,STACKFRAMESIZE
	b CFUNC(exit_vmx_usercopy)	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */