poly1305-sse2-x86_64.S

/*
 * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section .rodata.cst16.ANMASK, "aM", @progbits, 16
.align 16
ANMASK: .octa 0x0000000003ffffff0000000003ffffff

.section .rodata.cst16.ORMASK, "aM", @progbits, 16
.align 16
ORMASK: .octa 0x00000000010000000000000001000000
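# ANMASK keeps the low 26 bits of each 64-bit lane, i.e. one 26-bit
# limb per lane. ORMASK sets bit 24 of each lane: the 2^128 padding bit
# that Poly1305 appends to every full 16-byte block lands at bit
# 128 - 4*26 = 24 of limb 4.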

.text

#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)
#define s1 0x00(%rsp)
#define s2 0x04(%rsp)
#define s3 0x08(%rsp)
#define s4 0x0c(%rsp)
#define m %rsi
#define h01 %xmm0
#define h23 %xmm1
#define h44 %xmm2
#define t1 %xmm3
#define t2 %xmm4
#define t3 %xmm5
#define t4 %xmm6
#define mask %xmm7
#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %r12

ENTRY(poly1305_block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 16 byte input block m
        # %rdx: Poly1305 key r[5]
        # %rcx: Block count

        # This single block variant tries to improve performance by doing
        # two multiplications in parallel using SSE instructions. There is
        # quite some quadword packing involved, hence the speedup is
        # marginal.
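        # h and r are kept as five 26-bit limbs, x = x0 + x1*2^26 +
        # x2*2^52 + x3*2^78 + x4*2^104. Since r is clamped as RFC7539
        # requires, each limb and each 5*limb value fits in 32 bits, and
        # every limb product fits in 64 bits.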

        push %rbx
        push %r12
        sub $0x10,%rsp

        # s1..s4 = r1..r4 * 5
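        # (2^130 mod 2^130-5 = 5, so product terms h_i*r_j with i+j >= 5
        # wrap into limb i+j-5 scaled by 5; the lea below computes
        # %eax + 4*%eax = 5*%eax.)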
        mov r1,%eax
        lea (%eax,%eax,4),%eax
        mov %eax,s1
        mov r2,%eax
        lea (%eax,%eax,4),%eax
        mov %eax,s2
        mov r3,%eax
        lea (%eax,%eax,4),%eax
        mov %eax,s3
        mov r4,%eax
        lea (%eax,%eax,4),%eax
        mov %eax,s4
        movdqa ANMASK(%rip),mask

.Ldoblock:
        # h01 = [0, h1, 0, h0]
        # h23 = [0, h3, 0, h2]
        # h44 = [0, h4, 0, h4]
        movd h0,h01
        movd h1,t1
        movd h2,h23
        movd h3,t2
        movd h4,h44
        punpcklqdq t1,h01
        punpcklqdq t2,h23
        punpcklqdq h44,h44
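        # Message limb k starts at bit 26*k, so it is fetched with a
        # dword load at byte offset 3*k and shifted right by 2*k bits;
        # limb 4 (offset 12, shift 8) also gets the padding bit 2^24
        # ORed in.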
        # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
        movd 0x00(m),t1
        movd 0x03(m),t2
        psrld $2,t2
        punpcklqdq t2,t1
        pand mask,t1
        paddd t1,h01
        # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
        movd 0x06(m),t1
        movd 0x09(m),t2
        psrld $4,t1
        psrld $6,t2
        punpcklqdq t2,t1
        pand mask,t1
        paddd t1,h23
        # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
        mov 0x0c(m),%eax
        shr $8,%eax
        or $0x01000000,%eax
        movd %eax,t1
        pshufd $0xc4,t1,t1
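        # (0xc4 = 11 00 01 00b: dword 2 gets a copy of dword 0, dwords
        # 1 and 3 stay zero, so t1 = [0, x, 0, x])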
        paddd t1,h44
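
        # With s_j = 5*r_j, the limbs of h * r mod 2^130-5 are:
        #   d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1
        #   d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2
        #   d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3
        #   d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4
        #   d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
        # Each pmuludq below computes two of these products at once.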
        # t1[0] = h0 * r0 + h2 * s3
        # t1[1] = h1 * s4 + h3 * s2
        movd r0,t1
        movd s4,t2
        punpcklqdq t2,t1
        pmuludq h01,t1
        movd s3,t2
        movd s2,t3
        punpcklqdq t3,t2
        pmuludq h23,t2
        paddq t2,t1
        # t2[0] = h0 * r1 + h2 * s4
        # t2[1] = h1 * r0 + h3 * s3
        movd r1,t2
        movd r0,t3
        punpcklqdq t3,t2
        pmuludq h01,t2
        movd s4,t3
        movd s3,t4
        punpcklqdq t4,t3
        pmuludq h23,t3
        paddq t3,t2
        # t3[0] = h4 * s1
        # t3[1] = h4 * s2
        movd s1,t3
        movd s2,t4
        punpcklqdq t4,t3
        pmuludq h44,t3
        # d0 = t1[0] + t1[1] + t3[0]
        # d1 = t2[0] + t2[1] + t3[1]
        movdqa t1,t4
        punpcklqdq t2,t4
        punpckhqdq t2,t1
        paddq t4,t1
        paddq t3,t1
        movq t1,d0
        psrldq $8,t1
        movq t1,d1
        # t1[0] = h0 * r2 + h2 * r0
        # t1[1] = h1 * r1 + h3 * s4
        movd r2,t1
        movd r1,t2
        punpcklqdq t2,t1
        pmuludq h01,t1
        movd r0,t2
        movd s4,t3
        punpcklqdq t3,t2
        pmuludq h23,t2
        paddq t2,t1
        # t2[0] = h0 * r3 + h2 * r1
        # t2[1] = h1 * r2 + h3 * r0
        movd r3,t2
        movd r2,t3
        punpcklqdq t3,t2
        pmuludq h01,t2
        movd r1,t3
        movd r0,t4
        punpcklqdq t4,t3
        pmuludq h23,t3
        paddq t3,t2
        # t3[0] = h4 * s3
        # t3[1] = h4 * s4
        movd s3,t3
        movd s4,t4
        punpcklqdq t4,t3
        pmuludq h44,t3
        # d2 = t1[0] + t1[1] + t3[0]
        # d3 = t2[0] + t2[1] + t3[1]
        movdqa t1,t4
        punpcklqdq t2,t4
        punpckhqdq t2,t1
        paddq t4,t1
        paddq t3,t1
        movq t1,d2
        psrldq $8,t1
        movq t1,d3
        # t1[0] = h0 * r4 + h2 * r2
        # t1[1] = h1 * r3 + h3 * r1
        movd r4,t1
        movd r3,t2
        punpcklqdq t2,t1
        pmuludq h01,t1
        movd r2,t2
        movd r1,t3
        punpcklqdq t3,t2
        pmuludq h23,t2
        paddq t2,t1
        # t3[0] = h4 * r0
        movd r0,t3
        pmuludq h44,t3
        # d4 = t1[0] + t1[1] + t3[0]
        movdqa t1,t4
        psrldq $8,t4
        paddq t4,t1
        paddq t3,t1
        movq t1,d4
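
        # Carry propagation: reduce each 64-bit d to a 26-bit limb,
        # wrapping the final carry back into h0 as (d4 >> 26) * 5,
        # since 2^130 = 5 (mod 2^130-5).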
        # d1 += d0 >> 26
        mov d0,%rax
        shr $26,%rax
        add %rax,d1
        # h0 = d0 & 0x3ffffff
        mov d0,%rbx
        and $0x3ffffff,%ebx
        # d2 += d1 >> 26
        mov d1,%rax
        shr $26,%rax
        add %rax,d2
        # h1 = d1 & 0x3ffffff
        mov d1,%rax
        and $0x3ffffff,%eax
        mov %eax,h1
        # d3 += d2 >> 26
        mov d2,%rax
        shr $26,%rax
        add %rax,d3
        # h2 = d2 & 0x3ffffff
        mov d2,%rax
        and $0x3ffffff,%eax
        mov %eax,h2
        # d4 += d3 >> 26
        mov d3,%rax
        shr $26,%rax
        add %rax,d4
        # h3 = d3 & 0x3ffffff
        mov d3,%rax
        and $0x3ffffff,%eax
        mov %eax,h3
        # h0 += (d4 >> 26) * 5
        mov d4,%rax
        shr $26,%rax
        lea (%rax,%rax,4),%rax
        add %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov d4,%rax
        and $0x3ffffff,%eax
        mov %eax,h4
        # h1 += h0 >> 26
        mov %rbx,%rax
        shr $26,%rax
        add %eax,h1
        # h0 = h0 & 0x3ffffff
        andl $0x3ffffff,%ebx
        mov %ebx,h0

        add $0x10,m
        dec %rcx
        jnz .Ldoblock

        add $0x10,%rsp
        pop %r12
        pop %rbx
        ret
ENDPROC(poly1305_block_sse2)

#define u0 0x00(%r8)
#define u1 0x04(%r8)
#define u2 0x08(%r8)
#define u3 0x0c(%r8)
#define u4 0x10(%r8)
#define hc0 %xmm0
#define hc1 %xmm1
#define hc2 %xmm2
#define hc3 %xmm5
#define hc4 %xmm6
#define ru0 %xmm7
#define ru1 %xmm8
#define ru2 %xmm9
#define ru3 %xmm10
#define ru4 %xmm11
#define sv1 %xmm12
#define sv2 %xmm13
#define sv3 %xmm14
#define sv4 %xmm15
#undef d0
#define d0 %r13

ENTRY(poly1305_2block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 32 byte input m (two 16 byte blocks)
        # %rdx: Poly1305 key r[5]
        # %rcx: Doubleblock count
        # %r8: Poly1305 derived key r^2 u[5]

        # This two-block variant further improves performance by
        # processing two blocks per loop iteration. This is more
        # straightforward and does less byte shuffling, but requires a
        # second Poly1305 key r^2:
        # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
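        # Each xmm vector pairs the limbs of both blocks as
        # [ block 2 limb, block 1 limb ]: the low lane is multiplied by
        # u = r^2 and the high lane by r, so a single pmuludq advances
        # both blocks; the two lanes are summed when forming d0..d4.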
        push %rbx
        push %r12
        push %r13

        # combine r0,u0
        movd u0,ru0
        movd r0,t1
        punpcklqdq t1,ru0
        # combine r1,u1 and s1=r1*5,v1=u1*5
        movd u1,ru1
        movd r1,t1
        punpcklqdq t1,ru1
        movdqa ru1,sv1
        pslld $2,sv1
        paddd ru1,sv1
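        # (pslld $2 plus paddd computes 4*x + x = 5*x in each lane)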
        # combine r2,u2 and s2=r2*5,v2=u2*5
        movd u2,ru2
        movd r2,t1
        punpcklqdq t1,ru2
        movdqa ru2,sv2
        pslld $2,sv2
        paddd ru2,sv2
        # combine r3,u3 and s3=r3*5,v3=u3*5
        movd u3,ru3
        movd r3,t1
        punpcklqdq t1,ru3
        movdqa ru3,sv3
        pslld $2,sv3
        paddd ru3,sv3
        # combine r4,u4 and s4=r4*5,v4=u4*5
        movd u4,ru4
        movd r4,t1
        punpcklqdq t1,ru4
        movdqa ru4,sv4
        pslld $2,sv4
        paddd ru4,sv4

.Ldoblock2:
        # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
        movd 0x00(m),hc0
        movd 0x10(m),t1
        punpcklqdq t1,hc0
        pand ANMASK(%rip),hc0
        movd h0,t1
        paddd t1,hc0
        # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
        movd 0x03(m),hc1
        movd 0x13(m),t1
        punpcklqdq t1,hc1
        psrld $2,hc1
        pand ANMASK(%rip),hc1
        movd h1,t1
        paddd t1,hc1
        # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
        movd 0x06(m),hc2
        movd 0x16(m),t1
        punpcklqdq t1,hc2
        psrld $4,hc2
        pand ANMASK(%rip),hc2
        movd h2,t1
        paddd t1,hc2
        # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
        movd 0x09(m),hc3
        movd 0x19(m),t1
        punpcklqdq t1,hc3
        psrld $6,hc3
        pand ANMASK(%rip),hc3
        movd h3,t1
        paddd t1,hc3
        # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
        movd 0x0c(m),hc4
        movd 0x1c(m),t1
        punpcklqdq t1,hc4
        psrld $8,hc4
        por ORMASK(%rip),hc4
        movd h4,t1
        paddd t1,hc4
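
        # Five multiply/accumulate passes follow, one per output limb:
        # each pass runs five pmuludq against the combined r/u (or s/v)
        # vectors and then folds the high lane into the low lane.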
        # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
        movdqa ru0,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
        movdqa sv4,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
        movdqa sv3,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
        movdqa sv2,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
        movdqa sv1,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d0 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d0
        # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
        movdqa ru1,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
        movdqa ru0,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
        movdqa sv4,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
        movdqa sv3,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
        movdqa sv2,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d1 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d1
        # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
        movdqa ru2,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
        movdqa ru1,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
        movdqa ru0,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
        movdqa sv4,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
        movdqa sv3,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d2 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d2
        # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
        movdqa ru3,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
        movdqa ru2,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
        movdqa ru1,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
        movdqa ru0,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
        movdqa sv4,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d3 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d3
        # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
        movdqa ru4,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
        movdqa ru3,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
        movdqa ru2,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
        movdqa ru1,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
        movdqa ru0,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d4 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d4

        # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1
        # -> h2 -> h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and
        # h1 < 2^26 + a small amount. Careful: we must not assume the
        # carry bits 'd0 >> 26', 'd1 >> 26', 'd2 >> 26', 'd3 >> 26',
        # and '(d4 >> 26) * 5' fit in 32-bit integers. It's true in a
        # single-block implementation, but not here.

        # d1 += d0 >> 26
        mov d0,%rax
        shr $26,%rax
        add %rax,d1
        # h0 = d0 & 0x3ffffff
        mov d0,%rbx
        and $0x3ffffff,%ebx
        # d2 += d1 >> 26
        mov d1,%rax
        shr $26,%rax
        add %rax,d2
        # h1 = d1 & 0x3ffffff
        mov d1,%rax
        and $0x3ffffff,%eax
        mov %eax,h1
        # d3 += d2 >> 26
        mov d2,%rax
        shr $26,%rax
        add %rax,d3
        # h2 = d2 & 0x3ffffff
        mov d2,%rax
        and $0x3ffffff,%eax
        mov %eax,h2
        # d4 += d3 >> 26
        mov d3,%rax
        shr $26,%rax
        add %rax,d4
        # h3 = d3 & 0x3ffffff
        mov d3,%rax
        and $0x3ffffff,%eax
        mov %eax,h3
        # h0 += (d4 >> 26) * 5
        mov d4,%rax
        shr $26,%rax
        lea (%rax,%rax,4),%rax
        add %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov d4,%rax
        and $0x3ffffff,%eax
        mov %eax,h4
        # h1 += h0 >> 26
        mov %rbx,%rax
        shr $26,%rax
        add %eax,h1
        # h0 = h0 & 0x3ffffff
        andl $0x3ffffff,%ebx
        mov %ebx,h0

        add $0x20,m
        dec %rcx
        jnz .Ldoblock2

        pop %r13
        pop %r12
        pop %rbx
        ret
ENDPROC(poly1305_2block_sse2)
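
# The register assignments above imply C-side declarations along these
# lines (a sketch only; consult the kernel's poly1305 glue code,
# poly1305_glue.c, for the authoritative prototypes):
#
#   asmlinkage void poly1305_block_sse2(u32 *h, const u8 *m,
#                                       const u32 *r, unsigned int blocks);
#   asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *m,
#                                        const u32 *r, unsigned int blocks,
#                                        const u32 *u);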