chacha20-avx2-x86_64.S

/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.text

ENTRY(chacha20_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: 8 data blocks output, o
	# %rdx: 8 data blocks input, i

	# This function encrypts eight consecutive ChaCha20 blocks by
	# broadcasting each word of the state matrix across the eight 32-bit
	# lanes of an AVX register. As we need some scratch registers, we
	# spill the first four state rows (x0..x3) to the stack. The algorithm
	# performs each operation on the corresponding word of each state
	# matrix, hence requires no word shuffling. For the final XOR step we
	# transpose the matrix by interleaving 32-, 64- and then 128-bit
	# words, which allows us to do the XOR in AVX registers. 16- and 8-bit
	# word rotation is done with the slightly better performing byte
	# shuffle (vpshufb), 12- and 7-bit word rotation uses the traditional
	# shift+OR.
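	#
	# For orientation, the per-lane computation of one ChaCha20
	# quarter-round in C-like pseudocode (a reference sketch only, not
	# assembled as part of this file; rol32 is a 32-bit left rotate):
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d, 8);
	#	c += d; b ^= c; b = rol32(b, 7);
	#
	# Each .Ldoubleround8 iteration below applies this to the four column
	# groups (x0,x4,x8,x12)..(x3,x7,x11,x15) and then to the four diagonal
	# groups (x0,x5,x10,x15)..(x3,x4,x9,x14), with every %ymm lane holding
	# the corresponding word of a different block.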

	vzeroupper

	# 4 * 32 byte stack, 32-byte aligned
	lea	8(%rsp),%r10
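	# (%r10 remembers the original, unaligned stack pointer so it can be
	# restored with "lea -8(%r10),%rsp" before ret)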
	and	$~31, %rsp
	sub	$0x80, %rsp

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x0..3 on stack
	vmovdqa	%ymm0,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm3,0x60(%rsp)

	vmovdqa	CTRINC(%rip),%ymm1
	vmovdqa	ROT8(%rip),%ymm2
	vmovdqa	ROT16(%rip),%ymm3
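	# %ymm1 (CTRINC), %ymm2 (ROT8) and %ymm3 (ROT16) are kept live across
	# the ten double rounds below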
	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12

	mov	$10,%ecx
.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15
	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15
	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14
	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14
	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4

	dec	%ecx
	jnz	.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd	0x00(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd	0x20(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd	0x40(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd	0x60(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd	%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd	%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd	%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd	%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd	%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd	%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd	%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd	%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd	%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm15,%ymm15
	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12
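	# (the per-block counter increments must be re-applied here because
	# the feed-forward above only added the broadcast base counter s[12])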

	# interleave 32-bit words in state n, n+1
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	0x40(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm1,0x60(%rsp)
	vmovdqa	%ymm4,%ymm0
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vmovdqa	%ymm6,%ymm0
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm8,%ymm0
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vmovdqa	%ymm10,%ymm0
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vmovdqa	%ymm12,%ymm0
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vmovdqa	%ymm14,%ymm0
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x00(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	0x20(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x60(%rsp)
	vmovdqa	%ymm4,%ymm0
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vmovdqa	%ymm5,%ymm0
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm8,%ymm0
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vmovdqa	%ymm9,%ymm0
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vmovdqa	%ymm12,%ymm0
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vmovdqa	%ymm13,%ymm0
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	vmovdqa	0x00(%rsp),%ymm0
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
	vmovdqa	%ymm1,0x00(%rsp)
	vmovdqa	0x20(%rsp),%ymm0
	vperm2i128	$0x20,%ymm5,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm5,%ymm0,%ymm5
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	0x40(%rsp),%ymm0
	vperm2i128	$0x20,%ymm6,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm6,%ymm0,%ymm6
	vmovdqa	%ymm1,0x40(%rsp)
	vmovdqa	0x60(%rsp),%ymm0
	vperm2i128	$0x20,%ymm7,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm1,0x60(%rsp)
	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
	vmovdqa	%ymm0,%ymm8
	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
	vmovdqa	%ymm0,%ymm9
	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
	vmovdqa	%ymm0,%ymm10
	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
	vmovdqa	%ymm0,%ymm11

	# xor with corresponding input, write to output
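	# Each register (or spilled stack slot) now holds a contiguous 32-byte
	# half of one keystream block, hence the per-register input/output
	# offsets below rather than a single sequential walk.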
	vmovdqa	0x00(%rsp),%ymm0
	vpxor	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0000(%rsi)
	vmovdqa	0x20(%rsp),%ymm0
	vpxor	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0080(%rsi)
	vmovdqa	0x40(%rsp),%ymm0
	vpxor	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0040(%rsi)
	vmovdqa	0x60(%rsp),%ymm0
	vpxor	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00c0(%rsi)
	vpxor	0x0100(%rdx),%ymm4,%ymm4
	vmovdqu	%ymm4,0x0100(%rsi)
	vpxor	0x0180(%rdx),%ymm5,%ymm5
	vmovdqu	%ymm5,0x0180(%rsi)
	vpxor	0x0140(%rdx),%ymm6,%ymm6
	vmovdqu	%ymm6,0x0140(%rsi)
	vpxor	0x01c0(%rdx),%ymm7,%ymm7
	vmovdqu	%ymm7,0x01c0(%rsi)
	vpxor	0x0020(%rdx),%ymm8,%ymm8
	vmovdqu	%ymm8,0x0020(%rsi)
	vpxor	0x00a0(%rdx),%ymm9,%ymm9
	vmovdqu	%ymm9,0x00a0(%rsi)
	vpxor	0x0060(%rdx),%ymm10,%ymm10
	vmovdqu	%ymm10,0x0060(%rsi)
	vpxor	0x00e0(%rdx),%ymm11,%ymm11
	vmovdqu	%ymm11,0x00e0(%rsi)
	vpxor	0x0120(%rdx),%ymm12,%ymm12
	vmovdqu	%ymm12,0x0120(%rsi)
	vpxor	0x01a0(%rdx),%ymm13,%ymm13
	vmovdqu	%ymm13,0x01a0(%rsi)
	vpxor	0x0160(%rdx),%ymm14,%ymm14
	vmovdqu	%ymm14,0x0160(%rsi)
	vpxor	0x01e0(%rdx),%ymm15,%ymm15
	vmovdqu	%ymm15,0x01e0(%rsi)

	vzeroupper
	lea	-8(%r10),%rsp
	ret
ENDPROC(chacha20_8block_xor_avx2)