morus640-sse2-asm.S 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615
  1. /*
  2. * SSE2 implementation of MORUS-640
  3. *
  4. * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
  5. * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
  6. *
  7. * This program is free software; you can redistribute it and/or modify it
  8. * under the terms of the GNU General Public License version 2 as published
  9. * by the Free Software Foundation.
  10. */
  11. #include <linux/linkage.h>
  12. #include <asm/frame.h>
  /*
   * Register roles.  KEY and MSG are two names for the same register
   * (%xmm5): KEY is only referenced by the init routine, MSG by the
   * update/encrypt/decrypt/final paths.
   */
  #define STATE0  %xmm0
  #define STATE1  %xmm1
  #define STATE2  %xmm2
  #define STATE3  %xmm3
  #define STATE4  %xmm4
  #define KEY     %xmm5
  #define MSG     %xmm5
  #define T0      %xmm6
  #define T1      %xmm7

  /*
   * Build a pshufd immediate from four 2-bit dword selectors: i0 selects the
   * source dword for destination lane 0, ..., i3 for destination lane 3.
   */
  #define SHUFFLE_MASK(i0, i1, i2, i3) \
          (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))

  /* Dword permutations passed as the \w argument of morus640_round. */
  #define MASK1   SHUFFLE_MASK(3, 0, 1, 2)
  #define MASK2   SHUFFLE_MASK(2, 3, 0, 1)
  #define MASK3   SHUFFLE_MASK(1, 2, 3, 0)
  /*
   * Initial values for STATE3 and STATE4: 32 consecutive Fibonacci numbers,
   * each reduced modulo 256.
   */
  .section .rodata.cst16.morus640_const, "aM", @progbits, 32
  .p2align 4
  .Lmorus640_const_0:
          .byte 0x00, 0x01, 0x01, 0x02
          .byte 0x03, 0x05, 0x08, 0x0d
          .byte 0x15, 0x22, 0x37, 0x59
          .byte 0x90, 0xe9, 0x79, 0x62
  .Lmorus640_const_1:
          .byte 0xdb, 0x3d, 0x18, 0x55
          .byte 0x6d, 0xc2, 0x2f, 0xf1
          .byte 0x20, 0x11, 0x31, 0x42
          .byte 0x73, 0xb5, 0x28, 0xdd

  /*
   * Byte indices 0..15; compared against a broadcast byte count to build a
   * "first N bytes" mask in the partial-block decrypt path.
   */
  .section .rodata.cst16.morus640_counter, "aM", @progbits, 16
  .p2align 4
  .Lmorus640_counter:
          .byte 0x00, 0x01, 0x02, 0x03
          .byte 0x04, 0x05, 0x06, 0x07
          .byte 0x08, 0x09, 0x0a, 0x0b
          .byte 0x0c, 0x0d, 0x0e, 0x0f
  .text

  /*
   * morus640_round - one round of the state update
   *
   *   \s0 = rotl32(\s0 ^ (\s1 & \s2) ^ \s3, \b)    (per 32-bit lane)
   *   \s3 = dword-shuffle of \s3 selected by \w
   *
   * \s4 is accepted for symmetry at the call sites but is not used.
   * Clobbers T0 (left holding the left-shifted half of the rotate).
   */
  .macro morus640_round s0, s1, s2, s3, s4, b, w
          /* fold in \s3 first, so it can be shuffled straight away */
          pxor    \s3, \s0
          pshufd  $\w, \s3, \s3
          /* fold in \s1 & \s2 */
          movdqa  \s2, T0
          pand    \s1, T0
          pxor    T0, \s0
          /* rotate each dword left by \b; the two halves never overlap */
          movdqa  \s0, T0
          pslld   $\b, T0
          psrld   $(32 - \b), \s0
          por     T0, \s0
  .endm
  /*
   * __morus640_update - run the five-round state update with a message block
   *
   * Internal ABI:
   *   in:       STATE0..STATE4  current state
   *             MSG             message block
   *   out:      STATE0..STATE4  updated state
   *   clobbers: T0
   *
   * MSG is xor-ed into STATE1..STATE4 in turn, each time immediately before
   * the round that rewrites that state word.
   */
  __morus640_update:
          morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1

          pxor    MSG, STATE1
          morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2

          pxor    MSG, STATE2
          morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3

          pxor    MSG, STATE3
          morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2

          pxor    MSG, STATE4
          morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1

          ret
  ENDPROC(__morus640_update)
  /*
   * __morus640_update_zero - five-round state update with no message input
   *
   * Same round sequence as the message-absorbing update, minus the MSG xors
   * (equivalent to updating with an all-zero block).
   *
   * Internal ABI:
   *   in:       STATE0..STATE4  current state
   *   out:      STATE0..STATE4  updated state
   *   clobbers: T0
   */
  __morus640_update_zero:
          morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
          morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
          morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
          morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
          morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1

          ret
  ENDPROC(__morus640_update_zero)
  /*
   * __load_partial - load a partial block into MSG, zero-padded
   *
   * Internal ABI:
   *   in:       %rsi  src
   *             %rcx  number of bytes to load
   *   out:      MSG   the loaded bytes; remaining bytes are zero
   *   clobbers: T0, %r8, %r9
   *
   * The count is decomposed by its bits (1, 2, 4, 8).  Pieces are gathered
   * from the highest offset downwards; every new piece shifts what has been
   * collected so far upwards before being placed in the low part, so no
   * byte beyond src + count is ever read.
   */
  __load_partial:
          xor     %r9d, %r9d
          pxor    MSG, MSG

          /* bit 0: single byte at offset (count & 0x1E) */
          testb   $0x1, %cl
          jz      .Lld_partial_1
          mov     %rcx, %r8
          and     $0x1E, %r8
          movb    (%rsi,%r8), %r9b

  .Lld_partial_1:
          /* bit 1: 16-bit word at offset (count & 0x1C) */
          testb   $0x2, %cl
          jz      .Lld_partial_2
          mov     %rcx, %r8
          and     $0x1C, %r8
          shl     $16, %r9
          movw    (%rsi,%r8), %r9w

  .Lld_partial_2:
          /* bit 2: 32-bit dword at offset (count & 0x18) */
          testb   $0x4, %cl
          jz      .Lld_partial_4
          mov     %rcx, %r8
          and     $0x18, %r8
          shl     $32, %r9
          movl    (%rsi,%r8), %r8d
          xor     %r8, %r9

  .Lld_partial_4:
          /* up to 7 bytes collected so far go into the low qword of MSG */
          movq    %r9, MSG

          /* bit 3: 64-bit qword at offset (count & 0x10) */
          testb   $0x8, %cl
          jz      .Lld_partial_8
          mov     %rcx, %r8
          and     $0x10, %r8
          pslldq  $8, MSG
          movq    (%rsi,%r8), T0
          pxor    T0, MSG

  .Lld_partial_8:
          ret
  ENDPROC(__load_partial)
  /*
   * __store_partial - store the first %ecx bytes of T0 to memory
   *
   * Internal ABI:
   *   in:       %rdx  dst
   *             %rcx  number of bytes to store (only the low 32 bits are
   *                   read, since the C-level length is an unsigned int)
   *             T0    block to store
   *   out:      (memory at dst)
   *   clobbers: T0 (shifted right by 8 bytes when 8 or more bytes are
   *             stored), %r8, %r9, %r10
   *
   * Writes an 8-, 4-, 2- and 1-byte piece as needed, lowest bytes first, so
   * that nothing beyond dst + count is written.
   */
  __store_partial:
          mov     %ecx, %r8d              /* remaining bytes, zero-extended */
          mov     %rdx, %r9               /* write cursor */
          movq    T0, %r10                /* low qword of the block */

          cmp     $8, %r8
          jb      .Lst_partial_8          /* byte counts are unsigned */
          mov     %r10, (%r9)
          psrldq  $8, T0                  /* bring the high qword down */
          movq    T0, %r10
          sub     $8, %r8
          add     $8, %r9

  .Lst_partial_8:
          cmp     $4, %r8
          jb      .Lst_partial_4
          mov     %r10d, (%r9)
          shr     $32, %r10
          sub     $4, %r8
          add     $4, %r9

  .Lst_partial_4:
          cmp     $2, %r8
          jb      .Lst_partial_2
          mov     %r10w, (%r9)
          shr     $16, %r10
          sub     $2, %r8
          add     $2, %r9

  .Lst_partial_2:
          cmp     $1, %r8
          jb      .Lst_partial_1
          mov     %r10b, (%r9)

  .Lst_partial_1:
          ret
  ENDPROC(__store_partial)
  /*
   * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
   *
   * Builds the initial state from the IV and key and writes it to *state.
   *
   *   %rdi - state (5 x 16 bytes, written)
   *   %rsi - key   (16 bytes, read unaligned)
   *   %rdx - iv    (16 bytes, read unaligned)
   *
   * Uses %xmm0-%xmm6.  The constants are addressed RIP-relative so that the
   * code carries no absolute relocations.
   */
  ENTRY(crypto_morus640_sse2_init)
          FRAME_BEGIN

          /* load IV: */
          movdqu  (%rdx), STATE0

          /* load key (KEY is kept live until the final xor below): */
          movdqu  (%rsi), KEY
          movdqa  KEY, STATE1

          /* load all ones: */
          pcmpeqd STATE2, STATE2

          /* load the constants: */
          movdqa  .Lmorus640_const_0(%rip), STATE3
          movdqa  .Lmorus640_const_1(%rip), STATE4

          /* update 16 times with zero: */
          .rept 16
          call    __morus640_update_zero
          .endr

          /* xor-in the key again after updates: */
          pxor    KEY, STATE1

          /* store the state: */
          movdqu  STATE0, (0 * 16)(%rdi)
          movdqu  STATE1, (1 * 16)(%rdi)
          movdqu  STATE2, (2 * 16)(%rdi)
          movdqu  STATE3, (3 * 16)(%rdi)
          movdqu  STATE4, (4 * 16)(%rdi)

          FRAME_END
          ret
  ENDPROC(crypto_morus640_sse2_init)
  /*
   * void crypto_morus640_sse2_ad(void *state, const void *data,
   *                              unsigned int length);
   *
   * Feeds every whole 16-byte block of data into the state.  Bytes after
   * the last whole block are not consumed; if length < 16 the state is left
   * untouched.
   *
   *   %rdi - state (5 x 16 bytes, read and written)
   *   %rsi - data
   *   %edx - length in bytes
   *
   * Uses %xmm0-%xmm6 and %r8.
   */
  ENTRY(crypto_morus640_sse2_ad)
          FRAME_BEGIN

          /*
           * length is a 32-bit argument, so the upper half of %rdx is not
           * guaranteed to be zero on entry: zero-extend before using %rdx.
           */
          mov     %edx, %edx
          cmp     $16, %rdx
          jb      .Lad_out

          /* load the state: */
          movdqu  (0 * 16)(%rdi), STATE0
          movdqu  (1 * 16)(%rdi), STATE1
          movdqu  (2 * 16)(%rdi), STATE2
          movdqu  (3 * 16)(%rdi), STATE3
          movdqu  (4 * 16)(%rdi), STATE4

          /* pick the aligned or unaligned loop based on the data pointer: */
          mov     %rsi, %r8
          and     $0xF, %r8
          jnz     .Lad_u_loop

  .align 4
  .Lad_a_loop:
          movdqa  (%rsi), MSG
          call    __morus640_update
          sub     $16, %rdx
          add     $16, %rsi
          cmp     $16, %rdx
          jae     .Lad_a_loop             /* unsigned, matching the entry check */

          jmp     .Lad_cont

  .align 4
  .Lad_u_loop:
          movdqu  (%rsi), MSG
          call    __morus640_update
          sub     $16, %rdx
          add     $16, %rsi
          cmp     $16, %rdx
          jae     .Lad_u_loop

  .Lad_cont:
          /* store the state: */
          movdqu  STATE0, (0 * 16)(%rdi)
          movdqu  STATE1, (1 * 16)(%rdi)
          movdqu  STATE2, (2 * 16)(%rdi)
          movdqu  STATE3, (3 * 16)(%rdi)
          movdqu  STATE4, (4 * 16)(%rdi)

  .Lad_out:
          FRAME_END
          ret
  ENDPROC(crypto_morus640_sse2_ad)
  /*
   * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
   *                               unsigned int length);
   *
   * Encrypts every whole 16-byte block of src into dst and feeds the
   * plaintext block into the state.  Bytes after the last whole block are
   * not processed; if length < 16 nothing is read or written.
   *
   *   %rdi - state (5 x 16 bytes, read and written)
   *   %rsi - src
   *   %rdx - dst
   *   %ecx - length in bytes
   *
   * Each output block is
   *   src ^ STATE0 ^ shuffle(STATE1, MASK3) ^ (STATE2 & STATE3).
   * Uses %xmm0-%xmm7 and %r8.
   */
  ENTRY(crypto_morus640_sse2_enc)
          FRAME_BEGIN

          /*
           * length is a 32-bit argument, so the upper half of %rcx is not
           * guaranteed to be zero on entry: zero-extend before using %rcx.
           */
          mov     %ecx, %ecx
          cmp     $16, %rcx
          jb      .Lenc_out

          /* load the state: */
          movdqu  (0 * 16)(%rdi), STATE0
          movdqu  (1 * 16)(%rdi), STATE1
          movdqu  (2 * 16)(%rdi), STATE2
          movdqu  (3 * 16)(%rdi), STATE3
          movdqu  (4 * 16)(%rdi), STATE4

          /* the aligned loop needs both src and dst 16-byte aligned: */
          mov     %rsi, %r8
          or      %rdx, %r8
          and     $0xF, %r8
          jnz     .Lenc_u_loop

  .align 4
  .Lenc_a_loop:
          movdqa  (%rsi), MSG
          movdqa  MSG, T0
          pxor    STATE0, T0
          pshufd  $MASK3, STATE1, T1
          pxor    T1, T0
          movdqa  STATE2, T1
          pand    STATE3, T1
          pxor    T1, T0
          movdqa  T0, (%rdx)

          /* MSG still holds the plaintext block here */
          call    __morus640_update
          sub     $16, %rcx
          add     $16, %rsi
          add     $16, %rdx
          cmp     $16, %rcx
          jae     .Lenc_a_loop            /* unsigned, matching the entry check */

          jmp     .Lenc_cont

  .align 4
  .Lenc_u_loop:
          movdqu  (%rsi), MSG
          movdqa  MSG, T0
          pxor    STATE0, T0
          pshufd  $MASK3, STATE1, T1
          pxor    T1, T0
          movdqa  STATE2, T1
          pand    STATE3, T1
          pxor    T1, T0
          movdqu  T0, (%rdx)

          call    __morus640_update
          sub     $16, %rcx
          add     $16, %rsi
          add     $16, %rdx
          cmp     $16, %rcx
          jae     .Lenc_u_loop

  .Lenc_cont:
          /* store the state: */
          movdqu  STATE0, (0 * 16)(%rdi)
          movdqu  STATE1, (1 * 16)(%rdi)
          movdqu  STATE2, (2 * 16)(%rdi)
          movdqu  STATE3, (3 * 16)(%rdi)
          movdqu  STATE4, (4 * 16)(%rdi)

  .Lenc_out:
          FRAME_END
          ret
  ENDPROC(crypto_morus640_sse2_enc)
  /*
   * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
   *                                    unsigned int length);
   *
   * Encrypts one partial block: `length` bytes are read from src through
   * __load_partial (zero-padded to 16 bytes), the same number of ciphertext
   * bytes is written to dst through __store_partial, and the zero-padded
   * plaintext block is then fed into the state.
   *
   *   %rdi - state (5 x 16 bytes, read and written)
   *   %rsi - src
   *   %rdx - dst
   *   %rcx - length in bytes
   */
  ENTRY(crypto_morus640_sse2_enc_tail)
          FRAME_BEGIN

          /* bring the state into registers */
          movdqu  (0 * 16)(%rdi), STATE0
          movdqu  (1 * 16)(%rdi), STATE1
          movdqu  (2 * 16)(%rdi), STATE2
          movdqu  (3 * 16)(%rdi), STATE3
          movdqu  (4 * 16)(%rdi), STATE4

          /* MSG = zero-padded plaintext */
          call    __load_partial

          /* T0 = MSG ^ STATE0 ^ shuffle(STATE1) ^ (STATE2 & STATE3) */
          pshufd  $MASK3, STATE1, T0
          pxor    STATE0, T0
          pxor    MSG, T0
          movdqa  STATE3, T1
          pand    STATE2, T1
          pxor    T1, T0

          /* write out only `length` bytes of ciphertext */
          call    __store_partial

          /* feed the padded plaintext into the state */
          call    __morus640_update

          /* write the state back */
          movdqu  STATE0, (0 * 16)(%rdi)
          movdqu  STATE1, (1 * 16)(%rdi)
          movdqu  STATE2, (2 * 16)(%rdi)
          movdqu  STATE3, (3 * 16)(%rdi)
          movdqu  STATE4, (4 * 16)(%rdi)

          FRAME_END
          ret
  ENDPROC(crypto_morus640_sse2_enc_tail)
  /*
   * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
   *                               unsigned int length);
   *
   * Decrypts every whole 16-byte block of src into dst and feeds the
   * recovered plaintext block into the state.  Bytes after the last whole
   * block are not processed; if length < 16 nothing is read or written.
   *
   *   %rdi - state (5 x 16 bytes, read and written)
   *   %rsi - src
   *   %rdx - dst
   *   %ecx - length in bytes
   *
   * Each output block is
   *   src ^ STATE0 ^ shuffle(STATE1, MASK3) ^ (STATE2 & STATE3).
   * Uses %xmm0-%xmm6 and %r8.
   */
  ENTRY(crypto_morus640_sse2_dec)
          FRAME_BEGIN

          /*
           * length is a 32-bit argument, so the upper half of %rcx is not
           * guaranteed to be zero on entry: zero-extend before using %rcx.
           */
          mov     %ecx, %ecx
          cmp     $16, %rcx
          jb      .Ldec_out

          /* load the state: */
          movdqu  (0 * 16)(%rdi), STATE0
          movdqu  (1 * 16)(%rdi), STATE1
          movdqu  (2 * 16)(%rdi), STATE2
          movdqu  (3 * 16)(%rdi), STATE3
          movdqu  (4 * 16)(%rdi), STATE4

          /* the aligned loop needs both src and dst 16-byte aligned: */
          mov     %rsi, %r8
          or      %rdx, %r8
          and     $0xF, %r8
          jnz     .Ldec_u_loop

  .align 4
  .Ldec_a_loop:
          movdqa  (%rsi), MSG
          pxor    STATE0, MSG
          pshufd  $MASK3, STATE1, T0
          pxor    T0, MSG
          movdqa  STATE2, T0
          pand    STATE3, T0
          pxor    T0, MSG
          movdqa  MSG, (%rdx)

          /* MSG now holds the plaintext block */
          call    __morus640_update
          sub     $16, %rcx
          add     $16, %rsi
          add     $16, %rdx
          cmp     $16, %rcx
          jae     .Ldec_a_loop            /* unsigned, matching the entry check */

          jmp     .Ldec_cont

  .align 4
  .Ldec_u_loop:
          movdqu  (%rsi), MSG
          pxor    STATE0, MSG
          pshufd  $MASK3, STATE1, T0
          pxor    T0, MSG
          movdqa  STATE2, T0
          pand    STATE3, T0
          pxor    T0, MSG
          movdqu  MSG, (%rdx)

          call    __morus640_update
          sub     $16, %rcx
          add     $16, %rsi
          add     $16, %rdx
          cmp     $16, %rcx
          jae     .Ldec_u_loop

  .Ldec_cont:
          /* store the state: */
          movdqu  STATE0, (0 * 16)(%rdi)
          movdqu  STATE1, (1 * 16)(%rdi)
          movdqu  STATE2, (2 * 16)(%rdi)
          movdqu  STATE3, (3 * 16)(%rdi)
          movdqu  STATE4, (4 * 16)(%rdi)

  .Ldec_out:
          FRAME_END
          ret
  ENDPROC(crypto_morus640_sse2_dec)
  /*
   * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
   *                                    unsigned int length);
   *
   * Decrypts one partial block: `length` bytes are read from src through
   * __load_partial, decrypted, and the same number of plaintext bytes is
   * written to dst through __store_partial.  The block fed into the state
   * afterwards has every byte at index >= length forced to zero.
   *
   *   %rdi - state (5 x 16 bytes, read and written)
   *   %rsi - src
   *   %rdx - dst
   *   %rcx - length in bytes
   *
   * The counter table is addressed RIP-relative so that the code carries no
   * absolute relocations.
   */
  ENTRY(crypto_morus640_sse2_dec_tail)
          FRAME_BEGIN

          /* load the state: */
          movdqu  (0 * 16)(%rdi), STATE0
          movdqu  (1 * 16)(%rdi), STATE1
          movdqu  (2 * 16)(%rdi), STATE2
          movdqu  (3 * 16)(%rdi), STATE3
          movdqu  (4 * 16)(%rdi), STATE4

          /* decrypt message: */
          call    __load_partial

          pxor    STATE0, MSG
          pshufd  $MASK3, STATE1, T0
          pxor    T0, MSG
          movdqa  STATE2, T0
          pand    STATE3, T0
          pxor    T0, MSG

          /* __store_partial takes its input in T0 and may shift it */
          movdqa  MSG, T0
          call    __store_partial

          /*
           * mask with byte count: broadcast the low byte of the length to
           * all 16 lanes, then keep only the lanes whose index is below it.
           * The padding bytes of MSG were xor-ed with keystream above and
           * must be cleared before the block is fed into the state.
           */
          movq    %rcx, T0
          punpcklbw T0, T0
          punpcklbw T0, T0
          punpcklbw T0, T0
          punpcklbw T0, T0
          movdqa  .Lmorus640_counter(%rip), T1
          pcmpgtb T1, T0
          pand    T0, MSG

          call    __morus640_update

          /* store the state: */
          movdqu  STATE0, (0 * 16)(%rdi)
          movdqu  STATE1, (1 * 16)(%rdi)
          movdqu  STATE2, (2 * 16)(%rdi)
          movdqu  STATE3, (3 * 16)(%rdi)
          movdqu  STATE4, (4 * 16)(%rdi)

          FRAME_END
          ret
  ENDPROC(crypto_morus640_sse2_dec_tail)
  /*
   * void crypto_morus640_sse2_final(void *state, void *tag_xor,
   *                                 u64 assoclen, u64 cryptlen);
   *
   * Runs the finalization rounds over a block holding both lengths in bits
   * and xors the resulting 16-byte value into *tag_xor.  The state in
   * memory is only read; it is not written back.
   *
   *   %rdi - state (5 x 16 bytes, read)
   *   %rsi - tag_xor (16 bytes, read-modify-write, unaligned access)
   *   %rdx - assoclen in bytes
   *   %rcx - cryptlen in bytes
   */
  ENTRY(crypto_morus640_sse2_final)
          FRAME_BEGIN

          /* bring the state into registers */
          movdqu  (0 * 16)(%rdi), STATE0
          movdqu  (1 * 16)(%rdi), STATE1
          movdqu  (2 * 16)(%rdi), STATE2
          movdqu  (3 * 16)(%rdi), STATE3
          movdqu  (4 * 16)(%rdi), STATE4

          /* state[4] ^= state[0] */
          pxor    STATE0, STATE4

          /* length block: low qword = assoclen, high qword = cryptlen */
          movq    %rdx, MSG
          movq    %rcx, T0
          pslldq  $8, T0
          pxor    T0, MSG
          psllq   $3, MSG                 /* bytes -> bits, per qword */

          /* ten state updates with the length block */
          .rept 10
          call    __morus640_update
          .endr

          /* *tag_xor ^= STATE0 ^ shuffle(STATE1) ^ (STATE2 & STATE3) */
          movdqu  (%rsi), MSG
          pshufd  $MASK3, STATE1, T0
          pxor    T0, MSG
          pxor    STATE0, MSG
          movdqa  STATE3, T0
          pand    STATE2, T0
          pxor    T0, MSG
          movdqu  MSG, (%rsi)

          FRAME_END
          ret
  ENDPROC(crypto_morus640_sse2_final)