chacha-neon-core.S

/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

        .text
        .align  6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
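
//
// For reference, one ChaCha quarter-round as specified in RFC 7539, in
// illustrative C-style pseudocode (the loop below applies it to whole rows
// of the state matrix at once):
//
//      a += b; d ^= a; d = rol32(d, 16);
//      c += d; b ^= c; b = rol32(b, 12);
//      a += b; d ^= a; d = rol32(d,  8);
//      c += d; b ^= c; b = rol32(b,  7);
//
// The three rotate amounts map onto different NEON idioms below: rotate by
// 16 is a halfword swap (rev32 on .8h), rotates by 12 and 7 use a shl/sri
// pair, and rotate by 8 is a byte permutation (tbl) driven by the ROT8 mask
// defined in .rodata.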
SYM_FUNC_START_LOCAL(chacha_permute)

        adr_l   x10, ROT8
        ld1     {v12.4s}, [x10]

.Ldoubleround:
        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        rev32   v3.8h, v3.8h

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #12
        sri     v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        tbl     v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #7
        sri     v1.4s, v4.4s, #25

        // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        ext     v1.16b, v1.16b, v1.16b, #4
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext     v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        ext     v3.16b, v3.16b, v3.16b, #12

        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        rev32   v3.8h, v3.8h

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #12
        sri     v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        tbl     v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #7
        sri     v1.4s, v4.4s, #25

        // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        ext     v1.16b, v1.16b, v1.16b, #12
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext     v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        ext     v3.16b, v3.16b, v3.16b, #4

        subs    w3, w3, #2
        b.ne    .Ldoubleround

        ret
SYM_FUNC_END(chacha_permute)
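
//
// chacha_block_xor_neon - encrypt/decrypt a single 64-byte block
//
// Illustrative C prototype, matching the register usage documented below
// (the authoritative declaration lives in the arm64 ChaCha C glue code):
//
//      void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
//                                 int nrounds);
//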
SYM_FUNC_START(chacha_block_xor_neon)
        // x0: Input state matrix, s
        // x1: 1 data block output, o
        // x2: 1 data block input, i
        // w3: nrounds

        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        // x0..3 = s0..3
        ld1     {v0.4s-v3.4s}, [x0]
        ld1     {v8.4s-v11.4s}, [x0]

        bl      chacha_permute

        ld1     {v4.16b-v7.16b}, [x2]

        // o0 = i0 ^ (x0 + s0)
        add     v0.4s, v0.4s, v8.4s
        eor     v0.16b, v0.16b, v4.16b

        // o1 = i1 ^ (x1 + s1)
        add     v1.4s, v1.4s, v9.4s
        eor     v1.16b, v1.16b, v5.16b

        // o2 = i2 ^ (x2 + s2)
        add     v2.4s, v2.4s, v10.4s
        eor     v2.16b, v2.16b, v6.16b

        // o3 = i3 ^ (x3 + s3)
        add     v3.4s, v3.4s, v11.4s
        eor     v3.16b, v3.16b, v7.16b

        st1     {v0.16b-v3.16b}, [x1]

        ldp     x29, x30, [sp], #16
        ret
SYM_FUNC_END(chacha_block_xor_neon)
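
//
// hchacha_block_neon - HChaCha core used for XChaCha key derivation
//
// Runs the permutation and stores words 0-3 and 12-15 of the permuted state,
// without the feed-forward addition of the input state. Illustrative C
// prototype, matching the register usage documented below:
//
//      void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
//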
SYM_FUNC_START(hchacha_block_neon)
        // x0: Input state matrix, s
        // x1: output (8 32-bit words)
        // w2: nrounds

        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        ld1     {v0.4s-v3.4s}, [x0]

        mov     w3, w2
        bl      chacha_permute

        st1     {v0.4s}, [x1], #16
        st1     {v3.4s}, [x1]

        ldp     x29, x30, [sp], #16
        ret
SYM_FUNC_END(hchacha_block_neon)
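
// Scalar register aliases for the fifth ChaCha block: a0-a15 hold the 16
// state words of the block that chacha_4block_xor_neon processes in
// general-purpose registers, interleaved with the four NEON blocks.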
a0      .req    w12
a1      .req    w13
a2      .req    w14
a3      .req    w15
a4      .req    w16
a5      .req    w17
a6      .req    w19
a7      .req    w20
a8      .req    w21
a9      .req    w22
a10     .req    w23
a11     .req    w24
a12     .req    w25
a13     .req    w26
a14     .req    w27
a15     .req    w28

        .align  6
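
//
// chacha_4block_xor_neon - encrypt/decrypt four ChaCha blocks in NEON
// registers plus a fifth block in scalar registers
//
// Illustrative C prototype, matching the register usage documented below
// (the authoritative declaration lives in the arm64 ChaCha C glue code):
//
//      void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
//                                  int nrounds, int bytes);
//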
SYM_FUNC_START(chacha_4block_xor_neon)
        frame_push 10

        // x0: Input state matrix, s
        // x1: 4 data blocks output, o
        // x2: 4 data blocks input, i
        // w3: nrounds
        // x4: byte count

        adr_l   x10, .Lpermute
        and     x5, x4, #63
        add     x10, x10, x5

        //
        // This function encrypts four consecutive ChaCha blocks by loading
        // the state matrix into NEON registers four times. It performs each
        // operation on the corresponding word of each state matrix, hence it
        // requires no word shuffling. For the final XOR step we transpose the
        // matrix by interleaving 32- and then 64-bit words, which allows us
        // to do the XOR in NEON registers.
        //
        // At the same time, a fifth block is encrypted in parallel using
        // scalar registers.
        //
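
        // Data layout after the ld4r loads below: register vN holds state
        // word N of all four NEON blocks, one block per 32-bit lane, so each
        // round instruction operates on the same word of four blocks at once.
        // Adding CTRINC (1, 2, 3, 4) to v12 gives the NEON blocks the counter
        // values following the scalar block, whose counter was copied into
        // a12 beforehand. Scalar and NEON instructions are interleaved
        // throughout the round loop so that both pipelines stay busy.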
        adr_l   x9, CTRINC              // ... and ROT8
        ld1     {v30.4s-v31.4s}, [x9]

        // x0..15[0-3] = s0..3[0..3]
        add     x8, x0, #16
        ld4r    { v0.4s- v3.4s}, [x0]
        ld4r    { v4.4s- v7.4s}, [x8], #16
        ld4r    { v8.4s-v11.4s}, [x8], #16
        ld4r    {v12.4s-v15.4s}, [x8]

        mov     a0, v0.s[0]
        mov     a1, v1.s[0]
        mov     a2, v2.s[0]
        mov     a3, v3.s[0]
        mov     a4, v4.s[0]
        mov     a5, v5.s[0]
        mov     a6, v6.s[0]
        mov     a7, v7.s[0]
        mov     a8, v8.s[0]
        mov     a9, v9.s[0]
        mov     a10, v10.s[0]
        mov     a11, v11.s[0]
        mov     a12, v12.s[0]
        mov     a13, v13.s[0]
        mov     a14, v14.s[0]
        mov     a15, v15.s[0]

        // x12 += counter values 1-4
        add     v12.4s, v12.4s, v30.4s

.Ldoubleround4:
        // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        add     v0.4s, v0.4s, v4.4s
        add     a0, a0, a4
        add     v1.4s, v1.4s, v5.4s
        add     a1, a1, a5
        add     v2.4s, v2.4s, v6.4s
        add     a2, a2, a6
        add     v3.4s, v3.4s, v7.4s
        add     a3, a3, a7

        eor     v12.16b, v12.16b, v0.16b
        eor     a12, a12, a0
        eor     v13.16b, v13.16b, v1.16b
        eor     a13, a13, a1
        eor     v14.16b, v14.16b, v2.16b
        eor     a14, a14, a2
        eor     v15.16b, v15.16b, v3.16b
        eor     a15, a15, a3

        rev32   v12.8h, v12.8h
        ror     a12, a12, #16
        rev32   v13.8h, v13.8h
        ror     a13, a13, #16
        rev32   v14.8h, v14.8h
        ror     a14, a14, #16
        rev32   v15.8h, v15.8h
        ror     a15, a15, #16

        // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        add     v8.4s, v8.4s, v12.4s
        add     a8, a8, a12
        add     v9.4s, v9.4s, v13.4s
        add     a9, a9, a13
        add     v10.4s, v10.4s, v14.4s
        add     a10, a10, a14
        add     v11.4s, v11.4s, v15.4s
        add     a11, a11, a15

        eor     v16.16b, v4.16b, v8.16b
        eor     a4, a4, a8
        eor     v17.16b, v5.16b, v9.16b
        eor     a5, a5, a9
        eor     v18.16b, v6.16b, v10.16b
        eor     a6, a6, a10
        eor     v19.16b, v7.16b, v11.16b
        eor     a7, a7, a11

        shl     v4.4s, v16.4s, #12
        shl     v5.4s, v17.4s, #12
        shl     v6.4s, v18.4s, #12
        shl     v7.4s, v19.4s, #12

        sri     v4.4s, v16.4s, #20
        ror     a4, a4, #20
        sri     v5.4s, v17.4s, #20
        ror     a5, a5, #20
        sri     v6.4s, v18.4s, #20
        ror     a6, a6, #20
        sri     v7.4s, v19.4s, #20
        ror     a7, a7, #20

        // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        add     v0.4s, v0.4s, v4.4s
        add     a0, a0, a4
        add     v1.4s, v1.4s, v5.4s
        add     a1, a1, a5
        add     v2.4s, v2.4s, v6.4s
        add     a2, a2, a6
        add     v3.4s, v3.4s, v7.4s
        add     a3, a3, a7

        eor     v12.16b, v12.16b, v0.16b
        eor     a12, a12, a0
        eor     v13.16b, v13.16b, v1.16b
        eor     a13, a13, a1
        eor     v14.16b, v14.16b, v2.16b
        eor     a14, a14, a2
        eor     v15.16b, v15.16b, v3.16b
        eor     a15, a15, a3

        tbl     v12.16b, {v12.16b}, v31.16b
        ror     a12, a12, #24
        tbl     v13.16b, {v13.16b}, v31.16b
        ror     a13, a13, #24
        tbl     v14.16b, {v14.16b}, v31.16b
        ror     a14, a14, #24
        tbl     v15.16b, {v15.16b}, v31.16b
        ror     a15, a15, #24

        // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        add     v8.4s, v8.4s, v12.4s
        add     a8, a8, a12
        add     v9.4s, v9.4s, v13.4s
        add     a9, a9, a13
        add     v10.4s, v10.4s, v14.4s
        add     a10, a10, a14
        add     v11.4s, v11.4s, v15.4s
        add     a11, a11, a15

        eor     v16.16b, v4.16b, v8.16b
        eor     a4, a4, a8
        eor     v17.16b, v5.16b, v9.16b
        eor     a5, a5, a9
        eor     v18.16b, v6.16b, v10.16b
        eor     a6, a6, a10
        eor     v19.16b, v7.16b, v11.16b
        eor     a7, a7, a11

        shl     v4.4s, v16.4s, #7
        shl     v5.4s, v17.4s, #7
        shl     v6.4s, v18.4s, #7
        shl     v7.4s, v19.4s, #7

        sri     v4.4s, v16.4s, #25
        ror     a4, a4, #25
        sri     v5.4s, v17.4s, #25
        ror     a5, a5, #25
        sri     v6.4s, v18.4s, #25
        ror     a6, a6, #25
        sri     v7.4s, v19.4s, #25
        ror     a7, a7, #25

        // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        add     v0.4s, v0.4s, v5.4s
        add     a0, a0, a5
        add     v1.4s, v1.4s, v6.4s
        add     a1, a1, a6
        add     v2.4s, v2.4s, v7.4s
        add     a2, a2, a7
        add     v3.4s, v3.4s, v4.4s
        add     a3, a3, a4

        eor     v15.16b, v15.16b, v0.16b
        eor     a15, a15, a0
        eor     v12.16b, v12.16b, v1.16b
        eor     a12, a12, a1
        eor     v13.16b, v13.16b, v2.16b
        eor     a13, a13, a2
        eor     v14.16b, v14.16b, v3.16b
        eor     a14, a14, a3

        rev32   v15.8h, v15.8h
        ror     a15, a15, #16
        rev32   v12.8h, v12.8h
        ror     a12, a12, #16
        rev32   v13.8h, v13.8h
        ror     a13, a13, #16
        rev32   v14.8h, v14.8h
        ror     a14, a14, #16

        // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        add     v10.4s, v10.4s, v15.4s
        add     a10, a10, a15
        add     v11.4s, v11.4s, v12.4s
        add     a11, a11, a12
        add     v8.4s, v8.4s, v13.4s
        add     a8, a8, a13
        add     v9.4s, v9.4s, v14.4s
        add     a9, a9, a14

        eor     v16.16b, v5.16b, v10.16b
        eor     a5, a5, a10
        eor     v17.16b, v6.16b, v11.16b
        eor     a6, a6, a11
        eor     v18.16b, v7.16b, v8.16b
        eor     a7, a7, a8
        eor     v19.16b, v4.16b, v9.16b
        eor     a4, a4, a9

        shl     v5.4s, v16.4s, #12
        shl     v6.4s, v17.4s, #12
        shl     v7.4s, v18.4s, #12
        shl     v4.4s, v19.4s, #12

        sri     v5.4s, v16.4s, #20
        ror     a5, a5, #20
        sri     v6.4s, v17.4s, #20
        ror     a6, a6, #20
        sri     v7.4s, v18.4s, #20
        ror     a7, a7, #20
        sri     v4.4s, v19.4s, #20
        ror     a4, a4, #20

        // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        add     v0.4s, v0.4s, v5.4s
        add     a0, a0, a5
        add     v1.4s, v1.4s, v6.4s
        add     a1, a1, a6
        add     v2.4s, v2.4s, v7.4s
        add     a2, a2, a7
        add     v3.4s, v3.4s, v4.4s
        add     a3, a3, a4

        eor     v15.16b, v15.16b, v0.16b
        eor     a15, a15, a0
        eor     v12.16b, v12.16b, v1.16b
        eor     a12, a12, a1
        eor     v13.16b, v13.16b, v2.16b
        eor     a13, a13, a2
        eor     v14.16b, v14.16b, v3.16b
        eor     a14, a14, a3

        tbl     v15.16b, {v15.16b}, v31.16b
        ror     a15, a15, #24
        tbl     v12.16b, {v12.16b}, v31.16b
        ror     a12, a12, #24
        tbl     v13.16b, {v13.16b}, v31.16b
        ror     a13, a13, #24
        tbl     v14.16b, {v14.16b}, v31.16b
        ror     a14, a14, #24

        // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        add     v10.4s, v10.4s, v15.4s
        add     a10, a10, a15
        add     v11.4s, v11.4s, v12.4s
        add     a11, a11, a12
        add     v8.4s, v8.4s, v13.4s
        add     a8, a8, a13
        add     v9.4s, v9.4s, v14.4s
        add     a9, a9, a14

        eor     v16.16b, v5.16b, v10.16b
        eor     a5, a5, a10
        eor     v17.16b, v6.16b, v11.16b
        eor     a6, a6, a11
        eor     v18.16b, v7.16b, v8.16b
        eor     a7, a7, a8
        eor     v19.16b, v4.16b, v9.16b
        eor     a4, a4, a9

        shl     v5.4s, v16.4s, #7
        shl     v6.4s, v17.4s, #7
        shl     v7.4s, v18.4s, #7
        shl     v4.4s, v19.4s, #7

        sri     v5.4s, v16.4s, #25
        ror     a5, a5, #25
        sri     v6.4s, v17.4s, #25
        ror     a6, a6, #25
        sri     v7.4s, v18.4s, #25
        ror     a7, a7, #25
        sri     v4.4s, v19.4s, #25
        ror     a4, a4, #25

        subs    w3, w3, #2
        b.ne    .Ldoubleround4
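
        // Feed-forward: add the original state words to each block's permuted
        // state. The NEON blocks reload s0..s3 with ld4r (word-sliced, like
        // the working state), while the scalar block adds lane 0 of each word
        // via w6-w9. On big-endian, CPU_BE(rev ...) byte-swaps the scalar
        // keystream words so that they XOR correctly with the data words
        // loaded and stored via ldp/stp below.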
        ld4r    {v16.4s-v19.4s}, [x0], #16
        ld4r    {v20.4s-v23.4s}, [x0], #16

        // x12 += counter values 1-4
        add     v12.4s, v12.4s, v30.4s

        // x0[0-3] += s0[0]
        // x1[0-3] += s0[1]
        // x2[0-3] += s0[2]
        // x3[0-3] += s0[3]
        add     v0.4s, v0.4s, v16.4s
        mov     w6, v16.s[0]
        mov     w7, v17.s[0]
        add     v1.4s, v1.4s, v17.4s
        mov     w8, v18.s[0]
        mov     w9, v19.s[0]
        add     v2.4s, v2.4s, v18.4s
        add     a0, a0, w6
        add     a1, a1, w7
        add     v3.4s, v3.4s, v19.4s
        add     a2, a2, w8
        add     a3, a3, w9
CPU_BE( rev     a0, a0 )
CPU_BE( rev     a1, a1 )
CPU_BE( rev     a2, a2 )
CPU_BE( rev     a3, a3 )

        ld4r    {v24.4s-v27.4s}, [x0], #16
        ld4r    {v28.4s-v31.4s}, [x0]

        // x4[0-3] += s1[0]
        // x5[0-3] += s1[1]
        // x6[0-3] += s1[2]
        // x7[0-3] += s1[3]
        add     v4.4s, v4.4s, v20.4s
        mov     w6, v20.s[0]
        mov     w7, v21.s[0]
        add     v5.4s, v5.4s, v21.4s
        mov     w8, v22.s[0]
        mov     w9, v23.s[0]
        add     v6.4s, v6.4s, v22.4s
        add     a4, a4, w6
        add     a5, a5, w7
        add     v7.4s, v7.4s, v23.4s
        add     a6, a6, w8
        add     a7, a7, w9
CPU_BE( rev     a4, a4 )
CPU_BE( rev     a5, a5 )
CPU_BE( rev     a6, a6 )
CPU_BE( rev     a7, a7 )

        // x8[0-3] += s2[0]
        // x9[0-3] += s2[1]
        // x10[0-3] += s2[2]
        // x11[0-3] += s2[3]
        add     v8.4s, v8.4s, v24.4s
        mov     w6, v24.s[0]
        mov     w7, v25.s[0]
        add     v9.4s, v9.4s, v25.4s
        mov     w8, v26.s[0]
        mov     w9, v27.s[0]
        add     v10.4s, v10.4s, v26.4s
        add     a8, a8, w6
        add     a9, a9, w7
        add     v11.4s, v11.4s, v27.4s
        add     a10, a10, w8
        add     a11, a11, w9
CPU_BE( rev     a8, a8 )
CPU_BE( rev     a9, a9 )
CPU_BE( rev     a10, a10 )
CPU_BE( rev     a11, a11 )

        // x12[0-3] += s3[0]
        // x13[0-3] += s3[1]
        // x14[0-3] += s3[2]
        // x15[0-3] += s3[3]
        add     v12.4s, v12.4s, v28.4s
        mov     w6, v28.s[0]
        mov     w7, v29.s[0]
        add     v13.4s, v13.4s, v29.4s
        mov     w8, v30.s[0]
        mov     w9, v31.s[0]
        add     v14.4s, v14.4s, v30.4s
        add     a12, a12, w6
        add     a13, a13, w7
        add     v15.4s, v15.4s, v31.4s
        add     a14, a14, w8
        add     a15, a15, w9
CPU_BE( rev     a12, a12 )
CPU_BE( rev     a13, a13 )
CPU_BE( rev     a14, a14 )
CPU_BE( rev     a15, a15 )
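
        // At this point register vN still holds word N of all four NEON
        // blocks, one block per lane. The zip1/zip2 sequences below transpose
        // that word-sliced layout back into four contiguous 64-byte blocks:
        // first interleave 32-bit words of register pairs (n, n+1), then
        // 64-bit words of pairs (n, n+2), after which v0-v3, v4-v7, v8-v11
        // and v12-v15 hold the keystream of blocks 1 to 4, ready to be XORed
        // with the input using full-width NEON loads and stores.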

        // interleave 32-bit words in state n, n+1
        ldp     w6, w7, [x2], #64
        zip1    v16.4s, v0.4s, v1.4s
        ldp     w8, w9, [x2, #-56]
        eor     a0, a0, w6
        zip2    v17.4s, v0.4s, v1.4s
        eor     a1, a1, w7
        zip1    v18.4s, v2.4s, v3.4s
        eor     a2, a2, w8
        zip2    v19.4s, v2.4s, v3.4s
        eor     a3, a3, w9
        ldp     w6, w7, [x2, #-48]
        zip1    v20.4s, v4.4s, v5.4s
        ldp     w8, w9, [x2, #-40]
        eor     a4, a4, w6
        zip2    v21.4s, v4.4s, v5.4s
        eor     a5, a5, w7
        zip1    v22.4s, v6.4s, v7.4s
        eor     a6, a6, w8
        zip2    v23.4s, v6.4s, v7.4s
        eor     a7, a7, w9
        ldp     w6, w7, [x2, #-32]
        zip1    v24.4s, v8.4s, v9.4s
        ldp     w8, w9, [x2, #-24]
        eor     a8, a8, w6
        zip2    v25.4s, v8.4s, v9.4s
        eor     a9, a9, w7
        zip1    v26.4s, v10.4s, v11.4s
        eor     a10, a10, w8
        zip2    v27.4s, v10.4s, v11.4s
        eor     a11, a11, w9
        ldp     w6, w7, [x2, #-16]
        zip1    v28.4s, v12.4s, v13.4s
        ldp     w8, w9, [x2, #-8]
        eor     a12, a12, w6
        zip2    v29.4s, v12.4s, v13.4s
        eor     a13, a13, w7
        zip1    v30.4s, v14.4s, v15.4s
        eor     a14, a14, w8
        zip2    v31.4s, v14.4s, v15.4s
        eor     a15, a15, w9

        add     x3, x2, x4
        sub     x3, x3, #128            // start of last block

        subs    x5, x4, #128
        csel    x2, x2, x3, ge
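
        // Before each of the remaining 64-byte input loads, the byte count is
        // tested (x5/x6/x7/x8 = byte count minus 128/192/256/320); whenever a
        // full block of input is not available, the load pointer is rewound
        // to x3 (the last 64 bytes of input) so that ld1 never reads past the
        // end of the buffer. The partial block is fixed up in the .Lt*
        // handlers using the .Lpermute table and overlapping stores.
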
        // interleave 64-bit words in state n, n+2
        zip1    v0.2d, v16.2d, v18.2d
        zip2    v4.2d, v16.2d, v18.2d
        stp     a0, a1, [x1], #64
        zip1    v8.2d, v17.2d, v19.2d
        zip2    v12.2d, v17.2d, v19.2d
        stp     a2, a3, [x1, #-56]

        subs    x6, x4, #192
        ld1     {v16.16b-v19.16b}, [x2], #64
        csel    x2, x2, x3, ge

        zip1    v1.2d, v20.2d, v22.2d
        zip2    v5.2d, v20.2d, v22.2d
        stp     a4, a5, [x1, #-48]
        zip1    v9.2d, v21.2d, v23.2d
        zip2    v13.2d, v21.2d, v23.2d
        stp     a6, a7, [x1, #-40]

        subs    x7, x4, #256
        ld1     {v20.16b-v23.16b}, [x2], #64
        csel    x2, x2, x3, ge

        zip1    v2.2d, v24.2d, v26.2d
        zip2    v6.2d, v24.2d, v26.2d
        stp     a8, a9, [x1, #-32]
        zip1    v10.2d, v25.2d, v27.2d
        zip2    v14.2d, v25.2d, v27.2d
        stp     a10, a11, [x1, #-24]

        subs    x8, x4, #320
        ld1     {v24.16b-v27.16b}, [x2], #64
        csel    x2, x2, x3, ge

        zip1    v3.2d, v28.2d, v30.2d
        zip2    v7.2d, v28.2d, v30.2d
        stp     a12, a13, [x1, #-16]
        zip1    v11.2d, v29.2d, v31.2d
        zip2    v15.2d, v29.2d, v31.2d
        stp     a14, a15, [x1, #-8]

        tbnz    x5, #63, .Lt128
        ld1     {v28.16b-v31.16b}, [x2]

        // xor with corresponding input, write to output
        eor     v16.16b, v16.16b, v0.16b
        eor     v17.16b, v17.16b, v1.16b
        eor     v18.16b, v18.16b, v2.16b
        eor     v19.16b, v19.16b, v3.16b

        tbnz    x6, #63, .Lt192
        eor     v20.16b, v20.16b, v4.16b
        eor     v21.16b, v21.16b, v5.16b
        eor     v22.16b, v22.16b, v6.16b
        eor     v23.16b, v23.16b, v7.16b

        st1     {v16.16b-v19.16b}, [x1], #64
        tbnz    x7, #63, .Lt256
        eor     v24.16b, v24.16b, v8.16b
        eor     v25.16b, v25.16b, v9.16b
        eor     v26.16b, v26.16b, v10.16b
        eor     v27.16b, v27.16b, v11.16b

        st1     {v20.16b-v23.16b}, [x1], #64
        tbnz    x8, #63, .Lt320
        eor     v28.16b, v28.16b, v12.16b
        eor     v29.16b, v29.16b, v13.16b
        eor     v30.16b, v30.16b, v14.16b
        eor     v31.16b, v31.16b, v15.16b

        st1     {v24.16b-v27.16b}, [x1], #64
        st1     {v28.16b-v31.16b}, [x1]

.Lout:  frame_pop
        ret
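
        // Tail handling: when the byte count is not a multiple of 64, the
        // keystream of the final, partial block is shifted towards the end of
        // a 64-byte vector with tbl and the .Lpermute mask (which produces
        // zero bytes for the positions preceding the partial block). That
        // vector is XORed with the last 64 bytes of input and written with a
        // single 64-byte store ending exactly at the requested byte count,
        // overlapping the preceding block's output; the preceding block is
        // stored afterwards, so the overlapped bytes end up correct.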

        // fewer than 192 bytes of in/output
.Lt192: cbz     x5, 1f                  // exactly 128 bytes?
        ld1     {v28.16b-v31.16b}, [x10]
        add     x5, x5, x1
        tbl     v28.16b, {v4.16b-v7.16b}, v28.16b
        tbl     v29.16b, {v4.16b-v7.16b}, v29.16b
        tbl     v30.16b, {v4.16b-v7.16b}, v30.16b
        tbl     v31.16b, {v4.16b-v7.16b}, v31.16b

0:      eor     v20.16b, v20.16b, v28.16b
        eor     v21.16b, v21.16b, v29.16b
        eor     v22.16b, v22.16b, v30.16b
        eor     v23.16b, v23.16b, v31.16b
        st1     {v20.16b-v23.16b}, [x5] // overlapping stores
1:      st1     {v16.16b-v19.16b}, [x1]
        b       .Lout

        // fewer than 128 bytes of in/output
.Lt128: ld1     {v28.16b-v31.16b}, [x10]
        add     x5, x5, x1
        sub     x1, x1, #64
        tbl     v28.16b, {v0.16b-v3.16b}, v28.16b
        tbl     v29.16b, {v0.16b-v3.16b}, v29.16b
        tbl     v30.16b, {v0.16b-v3.16b}, v30.16b
        tbl     v31.16b, {v0.16b-v3.16b}, v31.16b
        ld1     {v16.16b-v19.16b}, [x1] // reload first output block
        b       0b

        // fewer than 256 bytes of in/output
.Lt256: cbz     x6, 2f                  // exactly 192 bytes?
        ld1     {v4.16b-v7.16b}, [x10]
        add     x6, x6, x1
        tbl     v0.16b, {v8.16b-v11.16b}, v4.16b
        tbl     v1.16b, {v8.16b-v11.16b}, v5.16b
        tbl     v2.16b, {v8.16b-v11.16b}, v6.16b
        tbl     v3.16b, {v8.16b-v11.16b}, v7.16b

        eor     v28.16b, v28.16b, v0.16b
        eor     v29.16b, v29.16b, v1.16b
        eor     v30.16b, v30.16b, v2.16b
        eor     v31.16b, v31.16b, v3.16b
        st1     {v28.16b-v31.16b}, [x6] // overlapping stores
2:      st1     {v20.16b-v23.16b}, [x1]
        b       .Lout

        // fewer than 320 bytes of in/output
.Lt320: cbz     x7, 3f                  // exactly 256 bytes?
        ld1     {v4.16b-v7.16b}, [x10]
        add     x7, x7, x1
        tbl     v0.16b, {v12.16b-v15.16b}, v4.16b
        tbl     v1.16b, {v12.16b-v15.16b}, v5.16b
        tbl     v2.16b, {v12.16b-v15.16b}, v6.16b
        tbl     v3.16b, {v12.16b-v15.16b}, v7.16b

        eor     v28.16b, v28.16b, v0.16b
        eor     v29.16b, v29.16b, v1.16b
        eor     v30.16b, v30.16b, v2.16b
        eor     v31.16b, v31.16b, v3.16b
        st1     {v28.16b-v31.16b}, [x7] // overlapping stores
3:      st1     {v24.16b-v27.16b}, [x1]
        b       .Lout
SYM_FUNC_END(chacha_4block_xor_neon)

        .section ".rodata", "a", %progbits
        .align  L1_CACHE_SHIFT
.Lpermute:
        .set    .Li, 0
        .rept   128
        .byte   (.Li - 64)
        .set    .Li, .Li + 1
        .endr
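
        // .Lpermute holds the byte values -64..63 (0xc0..0xff followed by
        // 0x00..0x3f). Loading 64 bytes at offset (byte count & 63) yields a
        // tbl index vector whose leading entries are out of range for a
        // 64-byte table (tbl returns zero for those) and whose trailing
        // entries select the first (byte count & 63) keystream bytes, shifted
        // to the end of the vector for the overlapping stores above.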

CTRINC: .word   1, 2, 3, 4
ROT8:   .word   0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
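
        // CTRINC supplies the per-block counter increments for the four NEON
        // blocks; the scalar block keeps the counter value from the input
        // state unchanged. ROT8 is a tbl index vector that rotates each
        // 32-bit element left by 8 bits: output byte i of every word is taken
        // from input byte ROT8[i], e.g. the first word's indices (3, 0, 1, 2)
        // turn bytes (b0, b1, b2, b3) into (b3, b0, b1, b2), which is
        // rotl32(x, 8) on a little-endian lane.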