#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.
  27. $flavour=shift;
  28. $output =shift;
  29. if ($flavour =~ /64/) {
  30. $SIZE_T=8;
  31. $LRSAVE=2*$SIZE_T;
  32. $STU="stdu";
  33. $POP="ld";
  34. $PUSH="std";
  35. } elsif ($flavour =~ /32/) {
  36. $SIZE_T=4;
  37. $LRSAVE=$SIZE_T;
  38. $STU="stwu";
  39. $POP="lwz";
  40. $PUSH="stw";
  41. } else { die "nonsense $flavour"; }
  42. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  43. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  44. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  45. die "can't locate ppc-xlate.pl";
  46. open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
  47. my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
  48. my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
  49. my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
  50. my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
  51. my $vrsave="r12";
  52. my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
  53. $code=<<___;
  54. .machine "any"
  55. .text
  56. .globl .gcm_init_p10
  57. lis r0,0xfff0
  58. li r8,0x10
  59. mfspr $vrsave,256
  60. li r9,0x20
  61. mtspr 256,r0
  62. li r10,0x30
  63. lvx_u $H,0,r4 # load H
  64. le?xor r7,r7,r7
  65. le?addi r7,r7,0x8 # need a vperm start with 08
  66. le?lvsr 5,0,r7
  67. le?vspltisb 6,0x0f
  68. le?vxor 5,5,6 # set a b-endian mask
  69. le?vperm $H,$H,$H,5
  70. vspltisb $xC2,-16 # 0xf0
  71. vspltisb $t0,1 # one
  72. vaddubm $xC2,$xC2,$xC2 # 0xe0
  73. vxor $zero,$zero,$zero
  74. vor $xC2,$xC2,$t0 # 0xe1
  75. vsldoi $xC2,$xC2,$zero,15 # 0xe1...
  76. vsldoi $t1,$zero,$t0,1 # ...1
  77. vaddubm $xC2,$xC2,$xC2 # 0xc2...
  78. vspltisb $t2,7
  79. vor $xC2,$xC2,$t1 # 0xc2....01
  80. vspltb $t1,$H,0 # most significant byte
  81. vsl $H,$H,$t0 # H<<=1
  82. vsrab $t1,$t1,$t2 # broadcast carry bit
  83. vand $t1,$t1,$xC2
  84. vxor $H,$H,$t1 # twisted H
  85. vsldoi $H,$H,$H,8 # twist even more ...
  86. vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
  87. vsldoi $Hl,$zero,$H,8 # ... and split
  88. vsldoi $Hh,$H,$zero,8
  89. stvx_u $xC2,0,r3 # save pre-computed table
  90. stvx_u $Hl,r8,r3
  91. stvx_u $H, r9,r3
  92. stvx_u $Hh,r10,r3
  93. mtspr 256,$vrsave
  94. blr
  95. .long 0
  96. .byte 0,12,0x14,0,0,0,2,0
  97. .long 0
  98. .size .gcm_init_p10,.-.gcm_init_p10
  99. .globl .gcm_init_htable
  100. lis r0,0xfff0
  101. li r8,0x10
  102. mfspr $vrsave,256
  103. li r9,0x20
  104. mtspr 256,r0
  105. li r10,0x30
  106. lvx_u $H,0,r4 # load H
  107. vspltisb $xC2,-16 # 0xf0
  108. vspltisb $t0,1 # one
  109. vaddubm $xC2,$xC2,$xC2 # 0xe0
  110. vxor $zero,$zero,$zero
  111. vor $xC2,$xC2,$t0 # 0xe1
  112. vsldoi $xC2,$xC2,$zero,15 # 0xe1...
  113. vsldoi $t1,$zero,$t0,1 # ...1
  114. vaddubm $xC2,$xC2,$xC2 # 0xc2...
  115. vspltisb $t2,7
  116. vor $xC2,$xC2,$t1 # 0xc2....01
  117. vspltb $t1,$H,0 # most significant byte
  118. vsl $H,$H,$t0 # H<<=1
  119. vsrab $t1,$t1,$t2 # broadcast carry bit
  120. vand $t1,$t1,$xC2
  121. vxor $IN,$H,$t1 # twisted H
  122. vsldoi $H,$IN,$IN,8 # twist even more ...
  123. vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
  124. vsldoi $Hl,$zero,$H,8 # ... and split
  125. vsldoi $Hh,$H,$zero,8
  126. stvx_u $xC2,0,r3 # save pre-computed table
  127. stvx_u $Hl,r8,r3
  128. li r8,0x40
  129. stvx_u $H, r9,r3
  130. li r9,0x50
  131. stvx_u $Hh,r10,r3
  132. li r10,0x60
  133. vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
  134. vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
  135. vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
  136. vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
  137. vsldoi $t0,$Xm,$zero,8
  138. vsldoi $t1,$zero,$Xm,8
  139. vxor $Xl,$Xl,$t0
  140. vxor $Xh,$Xh,$t1
  141. vsldoi $Xl,$Xl,$Xl,8
  142. vxor $Xl,$Xl,$t2
  143. vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
  144. vpmsumd $Xl,$Xl,$xC2
  145. vxor $t1,$t1,$Xh
  146. vxor $IN1,$Xl,$t1
  147. vsldoi $H2,$IN1,$IN1,8
  148. vsldoi $H2l,$zero,$H2,8
  149. vsldoi $H2h,$H2,$zero,8
  150. stvx_u $H2l,r8,r3 # save H^2
  151. li r8,0x70
  152. stvx_u $H2,r9,r3
  153. li r9,0x80
  154. stvx_u $H2h,r10,r3
  155. li r10,0x90
  156. vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
  157. vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
  158. vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
  159. vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
  160. vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
  161. vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
  162. vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
  163. vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
  164. vsldoi $t0,$Xm,$zero,8
  165. vsldoi $t1,$zero,$Xm,8
  166. vsldoi $t4,$Xm1,$zero,8
  167. vsldoi $t5,$zero,$Xm1,8
  168. vxor $Xl,$Xl,$t0
  169. vxor $Xh,$Xh,$t1
  170. vxor $Xl1,$Xl1,$t4
  171. vxor $Xh1,$Xh1,$t5
  172. vsldoi $Xl,$Xl,$Xl,8
  173. vsldoi $Xl1,$Xl1,$Xl1,8
  174. vxor $Xl,$Xl,$t2
  175. vxor $Xl1,$Xl1,$t6
  176. vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
  177. vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
  178. vpmsumd $Xl,$Xl,$xC2
  179. vpmsumd $Xl1,$Xl1,$xC2
  180. vxor $t1,$t1,$Xh
  181. vxor $t5,$t5,$Xh1
  182. vxor $Xl,$Xl,$t1
  183. vxor $Xl1,$Xl1,$t5
  184. vsldoi $H,$Xl,$Xl,8
  185. vsldoi $H2,$Xl1,$Xl1,8
  186. vsldoi $Hl,$zero,$H,8
  187. vsldoi $Hh,$H,$zero,8
  188. vsldoi $H2l,$zero,$H2,8
  189. vsldoi $H2h,$H2,$zero,8
  190. stvx_u $Hl,r8,r3 # save H^3
  191. li r8,0xa0
  192. stvx_u $H,r9,r3
  193. li r9,0xb0
  194. stvx_u $Hh,r10,r3
  195. li r10,0xc0
  196. stvx_u $H2l,r8,r3 # save H^4
  197. stvx_u $H2,r9,r3
  198. stvx_u $H2h,r10,r3
  199. mtspr 256,$vrsave
  200. blr
  201. .long 0
  202. .byte 0,12,0x14,0,0,0,2,0
  203. .long 0
  204. .size .gcm_init_htable,.-.gcm_init_htable
  205. .globl .gcm_gmult_p10
  206. lis r0,0xfff8
  207. li r8,0x10
  208. mfspr $vrsave,256
  209. li r9,0x20
  210. mtspr 256,r0
  211. li r10,0x30
  212. lvx_u $IN,0,$Xip # load Xi
  213. lvx_u $Hl,r8,$Htbl # load pre-computed table
  214. le?lvsl $lemask,r0,r0
  215. lvx_u $H, r9,$Htbl
  216. le?vspltisb $t0,0x07
  217. lvx_u $Hh,r10,$Htbl
  218. le?vxor $lemask,$lemask,$t0
  219. lvx_u $xC2,0,$Htbl
  220. le?vperm $IN,$IN,$IN,$lemask
  221. vxor $zero,$zero,$zero
  222. vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
  223. vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
  224. vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
  225. vpmsumd $t2,$Xl,$xC2 # 1st phase
  226. vsldoi $t0,$Xm,$zero,8
  227. vsldoi $t1,$zero,$Xm,8
  228. vxor $Xl,$Xl,$t0
  229. vxor $Xh,$Xh,$t1
  230. vsldoi $Xl,$Xl,$Xl,8
  231. vxor $Xl,$Xl,$t2
  232. vsldoi $t1,$Xl,$Xl,8 # 2nd phase
  233. vpmsumd $Xl,$Xl,$xC2
  234. vxor $t1,$t1,$Xh
  235. vxor $Xl,$Xl,$t1
  236. le?vperm $Xl,$Xl,$Xl,$lemask
  237. stvx_u $Xl,0,$Xip # write out Xi
  238. mtspr 256,$vrsave
  239. blr
  240. .long 0
  241. .byte 0,12,0x14,0,0,0,2,0
  242. .long 0
  243. .size .gcm_gmult_p10,.-.gcm_gmult_p10
  244. .globl .gcm_ghash_p10
  245. lis r0,0xfff8
  246. li r8,0x10
  247. mfspr $vrsave,256
  248. li r9,0x20
  249. mtspr 256,r0
  250. li r10,0x30
  251. lvx_u $Xl,0,$Xip # load Xi
  252. lvx_u $Hl,r8,$Htbl # load pre-computed table
  253. le?lvsl $lemask,r0,r0
  254. lvx_u $H, r9,$Htbl
  255. le?vspltisb $t0,0x07
  256. lvx_u $Hh,r10,$Htbl
  257. le?vxor $lemask,$lemask,$t0
  258. lvx_u $xC2,0,$Htbl
  259. le?vperm $Xl,$Xl,$Xl,$lemask
  260. vxor $zero,$zero,$zero
  261. lvx_u $IN,0,$inp
  262. addi $inp,$inp,16
  263. subi $len,$len,16
  264. le?vperm $IN,$IN,$IN,$lemask
  265. vxor $IN,$IN,$Xl
  266. b Loop
  267. .align 5
  268. Loop:
  269. subic $len,$len,16
  270. vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
  271. subfe. r0,r0,r0 # borrow?-1:0
  272. vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
  273. and r0,r0,$len
  274. vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
  275. add $inp,$inp,r0
  276. vpmsumd $t2,$Xl,$xC2 # 1st phase
  277. vsldoi $t0,$Xm,$zero,8
  278. vsldoi $t1,$zero,$Xm,8
  279. vxor $Xl,$Xl,$t0
  280. vxor $Xh,$Xh,$t1
  281. vsldoi $Xl,$Xl,$Xl,8
  282. vxor $Xl,$Xl,$t2
  283. lvx_u $IN,0,$inp
  284. addi $inp,$inp,16
  285. vsldoi $t1,$Xl,$Xl,8 # 2nd phase
  286. vpmsumd $Xl,$Xl,$xC2
  287. le?vperm $IN,$IN,$IN,$lemask
  288. vxor $t1,$t1,$Xh
  289. vxor $IN,$IN,$t1
  290. vxor $IN,$IN,$Xl
  291. beq Loop # did $len-=16 borrow?
  292. vxor $Xl,$Xl,$t1
  293. le?vperm $Xl,$Xl,$Xl,$lemask
  294. stvx_u $Xl,0,$Xip # write out Xi
  295. mtspr 256,$vrsave
  296. blr
  297. .long 0
  298. .byte 0,12,0x14,0,0,0,4,0
  299. .long 0
  300. .size .gcm_ghash_p10,.-.gcm_ghash_p10
  301. .asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
  302. .align 2
  303. ___
  304. foreach (split("\n",$code)) {
  305. if ($flavour =~ /le$/o) { # little-endian
  306. s/le\?//o or
  307. s/be\?/#be#/o;
  308. } else {
  309. s/le\?/#le#/o or
  310. s/be\?//o;
  311. }
  312. print $_,"\n";
  313. }
  314. close STDOUT; # enforce flush