// aes-gcm-aesni-x86_64.S
  1. /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
  2. //
  3. // AES-NI optimized AES-GCM for x86_64
  4. //
  5. // Copyright 2024 Google LLC
  6. //
  7. // Author: Eric Biggers <ebiggers@google.com>
  8. //
  9. //------------------------------------------------------------------------------
  10. //
  11. // This file is dual-licensed, meaning that you can use it under your choice of
  12. // either of the following two licenses:
  13. //
  14. // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
  15. // of the License at
  16. //
  17. // http://www.apache.org/licenses/LICENSE-2.0
  18. //
  19. // Unless required by applicable law or agreed to in writing, software
  20. // distributed under the License is distributed on an "AS IS" BASIS,
  21. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22. // See the License for the specific language governing permissions and
  23. // limitations under the License.
  24. //
  25. // or
  26. //
  27. // Redistribution and use in source and binary forms, with or without
  28. // modification, are permitted provided that the following conditions are met:
  29. //
  30. // 1. Redistributions of source code must retain the above copyright notice,
  31. // this list of conditions and the following disclaimer.
  32. //
  33. // 2. Redistributions in binary form must reproduce the above copyright
  34. // notice, this list of conditions and the following disclaimer in the
  35. // documentation and/or other materials provided with the distribution.
  36. //
  37. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  38. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  39. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  40. // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  41. // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  42. // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  43. // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  44. // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  45. // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  46. // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  47. // POSSIBILITY OF SUCH DAMAGE.
  48. //
  49. //------------------------------------------------------------------------------
  50. //
  51. // This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
  52. // support the original set of AES instructions, i.e. AES-NI. Two
  53. // implementations are provided, one that uses AVX and one that doesn't. They
  54. // are very similar, being generated by the same macros. The only difference is
  55. // that the AVX implementation takes advantage of VEX-coded instructions in some
  56. // places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
  57. // implementation does *not* use 256-bit vectors, as AES is not supported on
  58. // 256-bit vectors until the VAES feature (which this file doesn't target).
  59. //
  60. // The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
  61. // for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
  62. // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
  63. //
  64. // The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
  65. // more thoroughly commented. This file has the following notable changes:
  66. //
  67. // - The vector length is fixed at 128-bit, i.e. xmm registers. This means
  68. // there is only one AES block (and GHASH block) per register.
  69. //
  70. // - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
  71. // 32. We work around this by being much more careful about using
  72. // registers, relying heavily on loads to load values as they are needed.
  73. //
  74. // - Masking is not available either. We work around this by implementing
  75. // partial block loads and stores using overlapping scalar loads and stores
  76. // combined with shifts and SSE4.1 insertion and extraction instructions.
  77. //
  78. // - The main loop is organized differently due to the different design
  79. // constraints. First, with just one AES block per SIMD register, on some
  80. // CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
  81. // do an 8-register wide loop. Considering that and the fact that we have
  82. // just 16 SIMD registers to work with, it's not feasible to cache AES
  83. // round keys and GHASH key powers in registers across loop iterations.
  84. // That's not ideal, but also not actually that bad, since loads can run in
  85. // parallel with other instructions. Significantly, this also makes it
  86. // possible to roll up the inner loops, relying on hardware loop unrolling
  87. // instead of software loop unrolling, greatly reducing code size.
  88. //
  89. // - We implement the GHASH multiplications in the main loop using Karatsuba
  90. // multiplication instead of schoolbook multiplication. This saves one
  91. // pclmulqdq instruction per block, at the cost of one 64-bit load, one
  92. // pshufd, and 0.25 pxors per block. (This is without the three-argument
  93. // XOR support that would be provided by AVX512 / AVX10, which would be
  94. // more beneficial to schoolbook than Karatsuba.)
  95. //
  96. // As a rough approximation, we can assume that Karatsuba multiplication is
  97. // faster than schoolbook multiplication in this context if one pshufd and
  98. // 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
  99. // load is "free" due to running in parallel with arithmetic instructions.)
  100. // This is true on AMD CPUs, including all that support pclmulqdq up to at
  101. // least Zen 3. It's also true on older Intel CPUs: Westmere through
  102. // Haswell on the Core side, and Silvermont through Goldmont Plus on the
  103. // low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
  104. // benefit of Karatsuba should be substantial. On newer Intel CPUs,
  105. // schoolbook multiplication should be faster, but only marginally.
  106. //
  107. // Not all these CPUs were available to be tested. However, benchmarks on
  108. // available CPUs suggest that this approximation is plausible. Switching
  109. // to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
  110. // Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
  111. // Considering that and the fact that Karatsuba should be even more
  112. // beneficial on older Intel CPUs, it seems like the right choice here.
  113. //
  114. // An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
  115. // saved by using a multiplication-less reduction method. We don't do that
  116. // because it would require a large number of shift and xor instructions,
  117. // making it less worthwhile and likely harmful on newer CPUs.
  118. //
  119. // It does make sense to sometimes use a different reduction optimization
  120. // that saves a pclmulqdq, though: precompute the hash key times x^64, and
  121. // multiply the low half of the data block by the hash key with the extra
  122. // factor of x^64. This eliminates one step of the reduction. However,
  123. // this is incompatible with Karatsuba multiplication. Therefore, for
  124. // multi-block processing we use Karatsuba multiplication with a regular
  125. // reduction. For single-block processing, we use the x^64 optimization.
#include <linux/linkage.h>

.section .rodata
.p2align 4

	// pshufb mask that reverses the 16 bytes of a register, used to
	// convert between GHASH's bit-reflected byte order and memory order.
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f

	// Low 64 bits of the GHASH reduction polynomial constant
	// x^63 + x^62 + x^57 (bit-reflected representation).
.Lgfpoly:
	.quad	0xc200000000000000

	// 64-bit constant 1, used (via paddd) to increment the low 32-bit
	// word of the little-endian counter block.
.Lone:
	.quad	1

	// Used when multiplying the raw hash subkey by x^1: the gfpoly part
	// conditionally folds in the reduction, and bit 0 of each half
	// handles the carry into the other half's shifted-out position.
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001

	// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
	// 'len' 0xff bytes and the rest zeroes.
.Lzeropad_mask:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0

// Offsets in struct aes_gcm_key_aesni
// NOTE(review): these must stay in sync with the C struct definition —
// confirm against the corresponding header if the struct changes.
#define OFFSETOF_AESKEYLEN	480
#define OFFSETOF_H_POWERS	496
#define OFFSETOF_H_POWERS_XORED	624
#define OFFSETOF_H_TIMES_X64	688

.text
// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq.  The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
// USE_AVX must be defined to 0 or 1 before this macro is expanded.
.macro	_vpclmulqdq	imm, src1, src2, dst
.if USE_AVX
	vpclmulqdq	\imm, \src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.endif
.endm
// Do a vpshufb, or fall back to a movdqa and a pshufb.  The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro	_vpshufb	src1, src2, dst
.if USE_AVX
	vpshufb		\src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pshufb		\src1, \dst
.endif
.endm
// Do a vpand, or fall back to a movdqu and a pand.  The fallback assumes that
// all operands are distinct.  (movdqu rather than movdqa: \src1 may be an
// unaligned memory operand here.)
.macro	_vpand		src1, src2, dst
.if USE_AVX
	vpand		\src1, \src2, \dst
.else
	movdqu		\src1, \dst
	pand		\src2, \dst
.endif
.endm
// XOR the unaligned memory operand \mem into the xmm register \reg.  \tmp must
// be a temporary xmm register.
.macro	_xor_mem_to_reg	mem, reg, tmp
.if USE_AVX
	vpxor		\mem, \reg, \reg
.else
	movdqu		\mem, \tmp
	pxor		\tmp, \reg
.endif
.endm
// Test the unaligned memory operand \mem against the xmm register \reg (ptest:
// sets ZF/CF from the AND / ANDN of the operands).  \tmp must be a temporary
// xmm register.
.macro	_test_mem	mem, reg, tmp
.if USE_AVX
	vptest		\mem, \reg
.else
	movdqu		\mem, \tmp
	ptest		\tmp, \reg
.endif
.endm
// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
//
// This works without masked loads by doing two overlapping loads (first and
// last part of the buffer) and shifting away the doubly-loaded bytes, so it
// never reads outside [\src, \src + LEN).
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	movq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	pinsrq		$1, %rax, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64		// Position the last part above the first
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	movq		%rax, \dst		// movq zeroizes the high 8 bytes of \dst
.Ldone\@:
.endm
// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
//
// Like _load_partial_block, this uses overlapping accesses (rotated so the
// overlapping bytes are identical) instead of masked stores, and it never
// writes outside [\dst, \dst + LEN).
.macro	_store_partial_block	src, dst
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	pextrq		$1, \src, %rax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %rax		// Rotate so the overlap matches the first store
	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
	movq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	pextrd		$1, \src, %eax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
	movd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	pextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	pextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	pextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm
// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b.  To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
//
// This uses the "a * x^64" reduction optimization (see the file comment):
// multiplying b_L by the precomputed a*x^64 folds one reduction step into the
// multiplication itself, so only a single fold of MI into HI remains.
.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1

	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
	_vpclmulqdq	$0x01, \a, \b, \t0
.elseif \i == 1
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
.elseif \i == 2
	pxor		\t1, \t0

	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.elseif \i == 3
	_vpclmulqdq	$0x11, \a, \b, \t1
.elseif \i == 4
	pclmulqdq	$0x10, \a_times_x64, \b
.elseif \i == 5
	pxor		\t1, \b
.elseif \i == 6

	// Fold MI into HI.
	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
.elseif \i == 7
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	pxor		\t1, \b
.elseif \i == 9
	pxor		\t0, \b
.endif
.endm
// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details, including the required preconditions on
// \a_times_x64 and \gfpoly.
.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endr
.endm
// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication (3 pclmulqdq instead of 4; see the file
// comment) and must be paired with _ghash_reduce.  On the first call, \lo,
// \mi, and \hi must be zero.  \a_xored must contain the two halves of \a
// XOR'd together, i.e. a_L + a_H.  \b is clobbered.
.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, \a, \b, \t0
	pxor		\t0, \lo

	// b_L + b_H
	pshufd		$0x4e, \b, \t0
	pxor		\b, \t0

	// HI += a_H * b_H
	pclmulqdq	$0x11, \a, \b
	pxor		\b, \hi

	// MI += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0
	pxor		\t0, \mi
.endm
// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used, so it first converts the
// Karatsuba middle term back to the schoolbook middle term, then does the
// standard two-fold reduction by x^63 + x^62 + x^57.
.macro	_ghash_reduce	lo, mi, hi, dst, t0

	movq		.Lgfpoly(%rip), \t0

	// MI += LO + HI (needed because we used Karatsuba multiplication)
	pxor		\lo, \mi
	pxor		\hi, \mi

	// Fold LO into MI.
	pshufd		$0x4e, \lo, \dst	// Swap halves of LO
	pclmulqdq	$0x00, \t0, \lo		// LO_L*(x^63 + x^62 + x^57)
	pxor		\dst, \mi
	pxor		\lo, \mi

	// Fold MI into HI.
	pshufd		$0x4e, \mi, \dst	// Swap halves of MI
	pclmulqdq	$0x00, \t0, \mi		// MI_L*(x^63 + x^62 + x^57)
	pxor		\hi, \dst
	pxor		\mi, \dst
.endm
// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//		    blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI.  It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication.  See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting.  They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY.  Both macros clobber TMP[0-2].
.macro	_ghash_update_begin_8x	enc

	// Initialize the inner block counter.
	xor		%eax, %eax

	// Load the highest hash key power, H^8.
	movdqa		OFFSETOF_H_POWERS(KEY), TMP0

	// Load the first ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST), TMP1
.else
	movdqu		(SRC), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// Add the GHASH accumulator to the ciphertext block to get the block
	// 'b' that needs to be multiplied with the hash key power 'a'.
	pxor		TMP1, GHASH_ACC

	// b_L + b_H
	pshufd		$0x4e, GHASH_ACC, MI
	pxor		GHASH_ACC, MI

	// LO = a_L * b_L
	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	// HI = a_H * b_H
	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm
// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
//
// %rax is the inner block counter, incremented by 8 per block.  With scale 2
// it indexes the 16-byte H powers and data blocks; unscaled it indexes the
// 8-byte XOR'd-together key power halves.
.macro	_ghash_update_continue_8x	enc

	add		$8, %eax

	// Load the next lowest key power.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// Load the next ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST,%rax,2), TMP1
.else
	movdqu		(SRC,%rax,2), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
	pxor		TMP2, LO

	// b_L + b_H
	pshufd		$0x4e, TMP1, TMP2
	pxor		TMP1, TMP2

	// HI += a_H * b_H
	pclmulqdq	$0x11, TMP0, TMP1
	pxor		TMP1, GHASH_ACC

	// MI += (a_L + a_H) * (b_L + b_H)
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
	pxor		TMP2, MI
.endm
// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC.  This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination.  It's also divided into
// two steps.  TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI.  However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro	_ghash_update_end_8x_step	i
.if \i == 0
	movq		.Lgfpoly(%rip), TMP1

	// MI += LO + HI (Karatsuba -> schoolbook middle term), then fold LO
	// into MI.
	pxor		LO, MI
	pxor		GHASH_ACC, MI
	pshufd		$0x4e, LO, TMP2		// Swap halves of LO
	pclmulqdq	$0x00, TMP1, LO		// LO_L*(x^63 + x^62 + x^57)
	pxor		TMP2, MI
	pxor		LO, MI
.elseif \i == 1

	// Fold MI into HI (a.k.a. GHASH_ACC).
	pshufd		$0x4e, MI, TMP2		// Swap halves of MI
	pclmulqdq	$0x00, TMP1, MI		// MI_L*(x^63 + x^62 + x^57)
	pxor		TMP2, GHASH_ACC
	pxor		MI, GHASH_ACC
.endif
.endm
// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
// related fields in the key struct: the hash key powers H^1..H^8, the
// XOR'd-together halves of each power, and H^1 * x^64.
.macro	_aes_gcm_precompute

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi
	.set	H_CUR,		%xmm2
	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
	.set	GFPOLY,		%xmm5

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa		(KEY), H_POW1	// Zero-th round key XOR all-zeroes block
	lea		16(KEY), %rax
1:
	aesenc		(%rax), H_POW1
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey as needed to operate on GHASH's
	// bit-reflected values directly: reflect its bytes, then multiply it by
	// x^-1 (using the backwards interpretation of polynomial coefficients
	// from the GCM spec) or equivalently x^1 (using the alternative,
	// natural interpretation of polynomial coefficients).
	pshufb		.Lbswap_mask(%rip), H_POW1
	movdqa		H_POW1, %xmm0
	pshufd		$0xd3, %xmm0, %xmm0
	psrad		$31, %xmm0	// Broadcast the carry-out bits
	paddq		H_POW1, H_POW1	// Shift left by 1 within each half
	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0
	pxor		%xmm0, H_POW1	// Apply carry and conditional reduction

	// Store H^1.
	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.
	movq		.Lgfpoly(%rip), GFPOLY
	pshufd		$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor		%xmm0, H_POW1_X64
	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// Compute and store the halves of H^1 XOR'd together.
	// (%xmm0 still contains the swapped halves of H^1 here.)
	pxor		H_POW1, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.
	// %eax counts down by 8 from 6*8 to 0, so that the powers are stored
	// from highest to lowest: H^8 at offset 0, ..., H^2 at offset 6*16.
	movdqa		H_POW1, H_CUR
	mov		$6*8, %eax
.Lprecompute_next\@:
	// Compute H^i = H^{i-1} * H^1.
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
	// Store H^i.
	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Compute and store the halves of H^i XOR'd together.
	pshufd		$0x4e, H_CUR, %xmm0
	pxor		H_CUR, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	sub		$8, %eax
	jge		.Lprecompute_next\@

	RET
.endm
// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|.  On the first call, |ghash_acc| must be all
// zeroes.  |aadlen| must be a multiple of 16, except on the last call where it
// can be any length.  The caller must do any buffering needed to ensure this.
.macro	_aes_gcm_aad_update

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx
	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
	.set	GFPOLY,		%xmm6

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Process the AAD one full block at a time.
	sub		$16, AADLEN
	jl		.Laad_loop_1x_done\@
.Laad_loop_1x\@:
	movdqu		(AAD), %xmm0
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
	add		$16, AAD
	sub		$16, AADLEN
	jge		.Laad_loop_1x\@
.Laad_loop_1x_done\@:

	// Check whether there is a partial block at the end.
	add		$16, AADLEN
	jz		.Laad_done\@

	// Process a partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block assumes that %ecx contains AADLEN.  It also
	// zero-pads the block, which is exactly what GHASH requires here.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_done\@:
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
	RET
.endm
// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7].  Also XOR them with
// the zero-th AES round key.  Clobbers TMP0 and TMP1.
//
// Since TMP0 holds .Lone (a 64-bit 1 in the low lane), paddd increments only
// the low 32-bit word of the counter, as the GCM standard requires.
.macro	_ctr_begin_8x
	movq		.Lone(%rip), TMP0
	movdqa		(KEY), TMP1		// zero-th round key
.irp i, 0,1,2,3,4,5,6,7
	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i
	pxor		TMP1, AESDATA\i
	paddd		TMP0, LE_CTR
.endr
.endm
// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenc_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenc		\round_key, AESDATA\i
.endr
.endm
// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenclast_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenclast	\round_key, AESDATA\i
.endr
.endm
// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST.  Clobbers TMP0.  SRC and DST need not be aligned.
.macro	_xor_data_8x
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
.endr
.irp i, 0,1,2,3,4,5,6,7
	movdqu		AESDATA\i, \i*16(DST)
.endr
.endm
  587. // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
  588. // const u32 le_ctr[4], u8 ghash_acc[16],
  589. // const u8 *src, u8 *dst, int datalen);
  590. //
  591. // This macro generates a GCM encryption or decryption update function with the
  592. // above prototype (with \enc selecting which one).
  593. //
  594. // This function computes the next portion of the CTR keystream, XOR's it with
  595. // |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
  596. // data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
  597. // next |datalen| ciphertext bytes.
  598. //
  599. // |datalen| must be a multiple of 16, except on the last call where it can be
  600. // any length. The caller must do any buffering needed to ensure this. Both
  601. // in-place and out-of-place en/decryption are supported.
  602. //
  603. // |le_ctr| must give the current counter in little-endian format. For a new
  604. // message, the low word of the counter must be 2. This function loads the
  605. // counter from |le_ctr| and increments the loaded counter as needed, but it
  606. // does *not* store the updated counter back to |le_ctr|. The caller must
  607. // update |le_ctr| if any more data segments follow. Internally, only the low
  608. // 32-bit word of the counter is incremented, following the GCM standard.
609. .macro _aes_gcm_update enc
// This macro generates the body of aes_gcm_enc_update_* (\enc=1) or
// aes_gcm_dec_update_* (\enc=0).  It CTR-en/decrypts DATALEN bytes from
// SRC to DST while folding the ciphertext into the GHASH accumulator:
// 8 blocks at a time in the main loop, then 1 block at a time, then a
// zero-padded partial block if DATALEN isn't a multiple of 16.
610. // Function arguments
611. .set KEY, %rdi
612. .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg
613. .set GHASH_ACC_PTR, %rdx
614. .set SRC, %rcx
615. .set DST, %r8
616. .set DATALEN, %r9d
617. .set DATALEN64, %r9 // Zero-extend DATALEN before using!
618. // Note: the code setting up for _load_partial_block assumes that SRC is
619. // in %rcx (and that DATALEN is *not* in %rcx).
620. // Additional local variables
621. // %rax and %rsi are used as temporary registers. Note: %rsi overlaps
622. // with LE_CTR_PTR, which is used only at the beginning.
623. .set AESKEYLEN, %r10d // AES key length in bytes
624. .set AESKEYLEN64, %r10
625. .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key
626. // Put the most frequently used values in %xmm0-%xmm7 to reduce code
627. // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
628. .set TMP0, %xmm0
629. .set TMP1, %xmm1
630. .set TMP2, %xmm2
631. .set LO, %xmm3 // Low part of unreduced product
632. .set MI, %xmm4 // Middle part of unreduced product
633. .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also
634. // the high part of unreduced product
635. .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes
636. .set LE_CTR, %xmm7 // Little-endian counter value
637. .set AESDATA0, %xmm8
638. .set AESDATA1, %xmm9
639. .set AESDATA2, %xmm10
640. .set AESDATA3, %xmm11
641. .set AESDATA4, %xmm12
642. .set AESDATA5, %xmm13
643. .set AESDATA6, %xmm14
644. .set AESDATA7, %xmm15
645. movdqa .Lbswap_mask(%rip), BSWAP_MASK
646. movdqu (GHASH_ACC_PTR), GHASH_ACC
647. movdqu (LE_CTR_PTR), LE_CTR
648. movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
// RNDKEYLAST_PTR = KEY + 6*16 + 4*AESKEYLEN.  For AES-128/192/256
// (AESKEYLEN = 16/24/32) that is KEY + 16*10/12/14, i.e. the offset of
// the last round key — assuming the round keys start at offset 0 of
// the key struct (the struct layout is defined elsewhere in the file).
649. lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
650. // If there are at least 8*16 bytes of data, then continue into the main
651. // loop, which processes 8*16 bytes of data per iteration.
652. //
653. // The main loop interleaves AES and GHASH to improve performance on
654. // CPUs that can execute these instructions in parallel. When
655. // decrypting, the GHASH input (the ciphertext) is immediately
656. // available. When encrypting, we instead encrypt a set of 8 blocks
657. // first and then GHASH those blocks while encrypting the next set of 8,
658. // repeat that as needed, and finally GHASH the last set of 8 blocks.
659. //
660. // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
661. // as this makes the immediate fit in a signed byte, saving 3 bytes.
662. add $-8*16, DATALEN
663. jl .Lcrypt_loop_8x_done\@
664. .if \enc
665. // Encrypt the first 8 plaintext blocks.
666. _ctr_begin_8x
667. lea 16(KEY), %rsi
668. .p2align 4
669. 1:
670. movdqa (%rsi), TMP0
671. _aesenc_8x TMP0
672. add $16, %rsi
673. cmp %rsi, RNDKEYLAST_PTR
674. jne 1b
675. movdqa (%rsi), TMP0
676. _aesenclast_8x TMP0
677. _xor_data_8x
678. // Don't increment DST until the ciphertext blocks have been hashed.
679. sub $-8*16, SRC
680. add $-8*16, DATALEN
681. jl .Lghash_last_ciphertext_8x\@
682. .endif
683. .p2align 4
684. .Lcrypt_loop_8x\@:
685. // Generate the next set of 8 counter blocks and start encrypting them.
686. _ctr_begin_8x
687. lea 16(KEY), %rsi
688. // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
689. // by doing the unreduced multiplication for the first ciphertext block.
690. movdqa (%rsi), TMP0
691. add $16, %rsi
692. _aesenc_8x TMP0
693. _ghash_update_begin_8x \enc
694. // Do 7 more rounds of AES, and continue the GHASH update by doing the
695. // unreduced multiplication for the remaining ciphertext blocks.
696. .p2align 4
697. 1:
698. movdqa (%rsi), TMP0
699. add $16, %rsi
700. _aesenc_8x TMP0
// NOTE(review): %eax appears to be a block index maintained by the
// _ghash_update_*_8x helpers (defined earlier in the file); the loop
// ends once it reaches 7*8, i.e. after the 7th ciphertext block.
701. _ghash_update_continue_8x \enc
702. cmp $7*8, %eax
703. jne 1b
704. // Do the remaining AES rounds.
705. .p2align 4
706. 1:
707. movdqa (%rsi), TMP0
708. add $16, %rsi
709. _aesenc_8x TMP0
710. cmp %rsi, RNDKEYLAST_PTR
711. jne 1b
712. // Do the GHASH reduction and the last round of AES.
713. movdqa (RNDKEYLAST_PTR), TMP0
714. _ghash_update_end_8x_step 0
715. _aesenclast_8x TMP0
716. _ghash_update_end_8x_step 1
717. // XOR the data with the AES-CTR keystream blocks.
// When encrypting, DST still lags one 8-block set behind SRC (see the
// comment before the first set above), so advance it before writing.
718. .if \enc
719. sub $-8*16, DST
720. .endif
721. _xor_data_8x
722. sub $-8*16, SRC
723. .if !\enc
724. sub $-8*16, DST
725. .endif
726. add $-8*16, DATALEN
727. jge .Lcrypt_loop_8x\@
728. .if \enc
729. .Lghash_last_ciphertext_8x\@:
730. // Update GHASH with the last set of 8 ciphertext blocks.
731. _ghash_update_begin_8x \enc
732. .p2align 4
733. 1:
734. _ghash_update_continue_8x \enc
735. cmp $7*8, %eax
736. jne 1b
737. _ghash_update_end_8x_step 0
738. _ghash_update_end_8x_step 1
739. sub $-8*16, DST
740. .endif
741. .Lcrypt_loop_8x_done\@:
// Undo the final bias on DATALEN; it is now the number of remaining
// bytes, 0 <= DATALEN < 8*16.
742. sub $-8*16, DATALEN
743. jz .Ldone\@
744. // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
745. // things simple and keep the code size down by just going one block at
746. // a time, again taking advantage of hardware loop unrolling. Since
747. // there are enough key powers available for all remaining data, we do
748. // the GHASH multiplications unreduced, and only reduce at the very end.
749. .set HI, TMP2
750. .set H_POW, AESDATA0
751. .set H_POW_XORED, AESDATA1
752. .set ONE, AESDATA2
// ONE has 1 in its low 64 bits (.Lone is defined earlier in the file),
// so the paddd below increments only the low 32-bit word of LE_CTR, as
// required by the GCM standard.
753. movq .Lone(%rip), ONE
754. // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
755. pxor LO, LO
756. pxor MI, MI
757. pxor HI, HI
758. // Set up a block counter %rax to contain 8*(8-n), where n is the number
759. // of blocks that remain, counting any partial block. This will be used
760. // to access the key powers H^n through H^1.
761. mov DATALEN, %eax
762. neg %eax
763. and $~15, %eax
764. sar $1, %eax
765. add $64, %eax
// %eax = 64 - 8*ceil(DATALEN/16).  The neg/and/sar sequence computes
// -8*ceil(DATALEN/16): ANDing the negated value with ~15 rounds it
// down (toward -inf) to a multiple of 16, and the arithmetic shift
// halves it while preserving the sign.
766. sub $16, DATALEN
767. jl .Lcrypt_loop_1x_done\@
768. // Process the data one full block at a time.
769. .Lcrypt_loop_1x\@:
770. // Encrypt the next counter block.
771. _vpshufb BSWAP_MASK, LE_CTR, TMP0
772. paddd ONE, LE_CTR
773. pxor (KEY), TMP0
// %rsi points 6 round keys before the last one, so all the round-key
// offsets below fit in signed 8-bit displacements, shortening the
// aesenc encodings.
774. lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size
775. cmp $24, AESKEYLEN
776. jl 128f // AES-128?
777. je 192f // AES-192?
778. // AES-256
779. aesenc -7*16(%rsi), TMP0
780. aesenc -6*16(%rsi), TMP0
781. 192:
782. aesenc -5*16(%rsi), TMP0
783. aesenc -4*16(%rsi), TMP0
784. 128:
785. .irp i, -3,-2,-1,0,1,2,3,4,5
786. aesenc \i*16(%rsi), TMP0
787. .endr
788. aesenclast (RNDKEYLAST_PTR), TMP0
789. // Load the next key power H^i.
// With %rax = 8*(8-n), the scaled index selects H^n — the key powers
// are presumably stored from highest to lowest power (see the
// precompute function earlier in the file).  H_POW_XORED feeds the
// Karatsuba-style middle term in _ghash_mul_noreduce.
790. movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
791. movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
792. // XOR the keystream block that was just generated in TMP0 with the next
793. // source data block and store the resulting en/decrypted data to DST.
794. .if \enc
795. _xor_mem_to_reg (SRC), TMP0, tmp=TMP1
796. movdqu TMP0, (DST)
797. .else
798. movdqu (SRC), TMP1
799. pxor TMP1, TMP0
800. movdqu TMP0, (DST)
801. .endif
802. // Update GHASH with the ciphertext block.
// When encrypting, the ciphertext is in TMP0; when decrypting, it is
// the source block still held in TMP1.
803. .if \enc
804. pshufb BSWAP_MASK, TMP0
805. pxor TMP0, GHASH_ACC
806. .else
807. pshufb BSWAP_MASK, TMP1
808. pxor TMP1, GHASH_ACC
809. .endif
810. _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
// The accumulator was folded into LO/MI/HI, so clear it for the next
// block; advance to the next lower key power and the next 16 bytes.
811. pxor GHASH_ACC, GHASH_ACC
812. add $8, %eax
813. add $16, SRC
814. add $16, DST
815. sub $16, DATALEN
816. jge .Lcrypt_loop_1x\@
817. .Lcrypt_loop_1x_done\@:
818. // Check whether there is a partial block at the end.
819. add $16, DATALEN
820. jz .Lghash_reduce\@
821. // Process a partial block of length 1 <= DATALEN <= 15.
822. // Encrypt a counter block for the last time.
823. pshufb BSWAP_MASK, LE_CTR
824. pxor (KEY), LE_CTR
825. lea 16(KEY), %rsi
826. 1:
827. aesenc (%rsi), LE_CTR
828. add $16, %rsi
829. cmp %rsi, RNDKEYLAST_PTR
830. jne 1b
831. aesenclast (RNDKEYLAST_PTR), LE_CTR
832. // Load the lowest key power, H^1.
833. movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
834. movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
835. // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
836. // in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
837. // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
838. mov SRC, RNDKEYLAST_PTR
839. mov DATALEN, %ecx
// _load_partial_block takes the length in %rcx (set just above) and
// zero-pads the loaded bytes; %rsi/%esi are scratch.
840. _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi
841. // XOR the keystream block that was just generated in LE_CTR with the
842. // source data block and store the resulting en/decrypted data to DST.
843. pxor TMP0, LE_CTR
844. mov DATALEN, %ecx
845. _store_partial_block LE_CTR, DST
846. // If encrypting, zero-pad the final ciphertext block for GHASH. (If
847. // decrypting, this was already done by _load_partial_block.)
848. .if \enc
// Point %rax DATALEN bytes before the end of the zero-pad mask table
// so that the AND keeps exactly the first DATALEN ciphertext bytes.
849. lea .Lzeropad_mask+16(%rip), %rax
850. sub DATALEN64, %rax
851. _vpand (%rax), LE_CTR, TMP0
852. .endif
853. // Update GHASH with the final ciphertext block.
854. pshufb BSWAP_MASK, TMP0
855. pxor TMP0, GHASH_ACC
856. _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
857. .Lghash_reduce\@:
858. // Finally, do the GHASH reduction.
859. _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0
860. .Ldone\@:
861. // Store the updated GHASH accumulator back to memory.
862. movdqu GHASH_ACC, (GHASH_ACC_PTR)
863. RET
864. .endm
  865. // void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
  866. // const u32 le_ctr[4], u8 ghash_acc[16],
  867. // u64 total_aadlen, u64 total_datalen);
  868. // bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
  869. // const u32 le_ctr[4], const u8 ghash_acc[16],
  870. // u64 total_aadlen, u64 total_datalen,
  871. // const u8 tag[16], int taglen);
  872. //
  873. // This macro generates one of the above two functions (with \enc selecting
  874. // which one). Both functions finish computing the GCM authentication tag by
  875. // updating GHASH with the lengths block and encrypting the GHASH accumulator.
  876. // |total_aadlen| and |total_datalen| must be the total length of the additional
  877. // authenticated data and the en/decrypted data in bytes, respectively.
  878. //
  879. // The encryption function then stores the full-length (16-byte) computed
  880. // authentication tag to |ghash_acc|. The decryption function instead loads the
  881. // expected authentication tag (the one that was transmitted) from the 16-byte
  882. // buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
  883. // computed tag in constant time, and returns true if and only if they match.
884. .macro _aes_gcm_final enc
// \enc selects the encrypt-final (store full 16-byte tag) or
// decrypt-final (constant-time tag verification, returns bool in %al)
// variant; see the function comment above this macro.
885. // Function arguments
886. .set KEY, %rdi
887. .set LE_CTR_PTR, %rsi
888. .set GHASH_ACC_PTR, %rdx
889. .set TOTAL_AADLEN, %rcx
890. .set TOTAL_DATALEN, %r8
891. .set TAG, %r9
892. .set TAGLEN, %r10d // Originally at 8(%rsp)
893. .set TAGLEN64, %r10
894. // Additional local variables.
895. // %rax and %xmm0-%xmm2 are used as temporary registers.
896. .set AESKEYLEN, %r11d
897. .set AESKEYLEN64, %r11
898. .set BSWAP_MASK, %xmm3
899. .set GHASH_ACC, %xmm4
900. .set H_POW1, %xmm5 // H^1
901. .set H_POW1_X64, %xmm6 // H^1 * x^64
902. .set GFPOLY, %xmm7
903. movdqa .Lbswap_mask(%rip), BSWAP_MASK
904. movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
905. // Set up a counter block with 1 in the low 32-bit word. This is the
906. // counter that produces the ciphertext needed to encrypt the auth tag.
// (In GCM, counter value 1 encrypts the tag, while values 2 and up
// were used for the data blocks.)
907. movdqu (LE_CTR_PTR), %xmm0
908. mov $1, %eax
909. pinsrd $0, %eax, %xmm0
910. // Build the lengths block and XOR it into the GHASH accumulator.
// Lengths block = (bitlen(AAD) << 64) | bitlen(data), built here with
// byte lengths and converted to bits by the 3-bit left shift.
911. movq TOTAL_DATALEN, GHASH_ACC
912. pinsrq $1, TOTAL_AADLEN, GHASH_ACC
913. psllq $3, GHASH_ACC // Bytes to bits
914. _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1
915. movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
916. movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
917. movq .Lgfpoly(%rip), GFPOLY
918. // Make %rax point to the 6th from last AES round key. (Using signed
919. // byte offsets -7*16 through 6*16 decreases code size.)
// %rax = KEY + 4*AESKEYLEN, i.e. (last round key pointer) - 6*16; cf.
// RNDKEYLAST_PTR in _aes_gcm_update.
920. lea (KEY,AESKEYLEN64,4), %rax
921. // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
922. // Interleave the AES and GHASH instructions to improve performance.
923. pshufb BSWAP_MASK, %xmm0
924. pxor (KEY), %xmm0
925. cmp $24, AESKEYLEN
926. jl 128f // AES-128?
927. je 192f // AES-192?
928. // AES-256
929. aesenc -7*16(%rax), %xmm0
930. aesenc -6*16(%rax), %xmm0
931. 192:
932. aesenc -5*16(%rax), %xmm0
933. aesenc -4*16(%rax), %xmm0
934. 128:
// One AES round per GHASH multiplication step; _ghash_mul_step is
// defined earlier in the file and takes 10 steps (0-9) total.
935. .irp i, 0,1,2,3,4,5,6,7,8
936. aesenc (\i-3)*16(%rax), %xmm0
937. _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
938. .endr
939. aesenclast 6*16(%rax), %xmm0
940. _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
941. // Undo the byte reflection of the GHASH accumulator.
942. pshufb BSWAP_MASK, GHASH_ACC
943. // Encrypt the GHASH accumulator.
944. pxor %xmm0, GHASH_ACC
945. .if \enc
946. // Return the computed auth tag.
947. movdqu GHASH_ACC, (GHASH_ACC_PTR)
948. .else
949. .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!
950. // Verify the auth tag in constant time by XOR'ing the transmitted and
951. // computed auth tags together and using the ptest instruction to check
952. // whether the first TAGLEN bytes of the result are zero.
953. _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0
// TAGLEN is the 7th argument, passed on the stack at 8(%rsp).
954. movl 8(%rsp), TAGLEN
955. lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
// ZEROPAD_MASK_PTR now points TAGLEN bytes before the end of the
// .Lzeropad_mask table (defined earlier in the file), which presumably
// yields a mask of TAGLEN 0xff bytes followed by zeroes — confirm
// against the table definition.
956. sub TAGLEN64, ZEROPAD_MASK_PTR
957. xor %eax, %eax
958. _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
// ZF is set iff the first TAGLEN bytes of the XOR'ed tags are all
// zero, so %al (the bool return value) becomes 1 on match, else 0.
959. sete %al
960. .endif
961. RET
962. .endm
963. .set USE_AVX, 0
// Instantiate the exported functions.  USE_AVX chooses between legacy
// SSE and VEX-coded (AVX) instruction forms in the shared helper
// macros — presumably via conditional assembly in the macros defined
// earlier in the file; confirm against those definitions.
//
// Non-AVX (SSE + AES-NI + PCLMULQDQ) variants:
964. SYM_FUNC_START(aes_gcm_precompute_aesni)
965. _aes_gcm_precompute
966. SYM_FUNC_END(aes_gcm_precompute_aesni)
967. SYM_FUNC_START(aes_gcm_aad_update_aesni)
968. _aes_gcm_aad_update
969. SYM_FUNC_END(aes_gcm_aad_update_aesni)
970. SYM_FUNC_START(aes_gcm_enc_update_aesni)
971. _aes_gcm_update 1
972. SYM_FUNC_END(aes_gcm_enc_update_aesni)
973. SYM_FUNC_START(aes_gcm_dec_update_aesni)
974. _aes_gcm_update 0
975. SYM_FUNC_END(aes_gcm_dec_update_aesni)
976. SYM_FUNC_START(aes_gcm_enc_final_aesni)
977. _aes_gcm_final 1
978. SYM_FUNC_END(aes_gcm_enc_final_aesni)
979. SYM_FUNC_START(aes_gcm_dec_final_aesni)
980. _aes_gcm_final 0
981. SYM_FUNC_END(aes_gcm_dec_final_aesni)
// AVX variants of the same six functions:
982. .set USE_AVX, 1
983. SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
984. _aes_gcm_precompute
985. SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
986. SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
987. _aes_gcm_aad_update
988. SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
989. SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
990. _aes_gcm_update 1
991. SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
992. SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
993. _aes_gcm_update 0
994. SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
995. SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
996. _aes_gcm_final 1
997. SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
998. SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
999. _aes_gcm_final 0
1000. SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)