Mirror of BoringSSL (grpc依赖)
https://boringssl.googlesource.com/boringssl
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
288 lines
9.7 KiB
288 lines
9.7 KiB
#!/usr/bin/env perl |
|
# Copyright (c) 2019, Google Inc. |
|
# |
|
# Permission to use, copy, modify, and/or distribute this software for any |
|
# purpose with or without fee is hereby granted, provided that the above |
|
# copyright notice and this permission notice appear in all copies. |
|
# |
|
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
|
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
|
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
|
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
|
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
|
|
# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit |
|
# table-based GHASH implementation. It requires SSSE3 instructions. |
|
# |
|
# For background, the table-based strategy is a 4-bit windowed multiplication. |
|
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops |
|
# over 4-bit windows of the input and indexes them up into the table. Visually, |
|
# it multiplies as in the schoolbook multiplication diagram below, but with |
|
# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First |
|
# it incorporates the terms labeled '1' by indexing the most significant term |
|
# of X into the table. Then it shifts and repeats for '2' and so on. |
|
# |
|
# hhhhhh |
|
# * xxxxxx |
|
# ============ |
|
# 666666 |
|
# 555555 |
|
# 444444 |
|
# 333333 |
|
# 222222 |
|
# 111111 |
|
# |
|
# This implementation changes the order. We treat the table as a 16×16 matrix |
|
# and transpose it. The first row is then the first byte of each multiple of H, |
|
# and so on. We then reorder terms as below. Observe that the terms labeled '1' |
|
# and '2' are all lookups into the first row, etc. This maps well to the SSSE3 |
|
# pshufb instruction, using alternating terms of X in parallel as indices. This |
|
# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and |
|
# repeat for each row. |
|
# |
|
# hhhhhh |
|
# * xxxxxx |
|
# ============ |
|
# 224466 |
|
# 113355 |
|
# 224466 |
|
# 113355 |
|
# 224466 |
|
# 113355 |
|
# |
|
# Next we account for GCM's confusing bit order. The "first" bit is the least |
|
# significant coefficient, but GCM treats the most sigificant bit within a byte |
|
# as first. Bytes are little-endian, and bits are big-endian. We reverse the |
|
# bytes in XMM registers for a consistent bit and byte ordering, but this means |
|
# the least significant bit is the most significant coefficient and vice versa. |
|
# |
|
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the |
|
# bit ordering within the XMM register, rather than the reversed coefficient |
|
# ordering. Low bits are less significant bits and more significant |
|
# coefficients. Right-shifts move from MSB to the LSB and correspond to |
|
# increasing the power of each coefficient. |
|
# |
|
# Note this bit reversal enters into the table's column indices. H*1 is stored |
|
# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier |
|
# table rows contain more significant coefficients, so we iterate forwards. |
|
|
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
|
push(@INC,"${dir}","${dir}../../../perlasm"); |
|
require "x86asm.pl"; |
|
|
|
$output = pop; |
|
open STDOUT, ">$output"; |
|
|
|
&asm_init($ARGV[0]); |
|
|
|
my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx"); |
|
&static_label("reverse_bytes"); |
|
&static_label("low4_mask"); |
|
|
|
my $call_counter = 0; |
|
# process_rows emits assembly code to process $rows rows of the table. On |
|
# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the |
|
# low and high halves of the input. The result so far is passed in xmm2. xmm3 |
|
# must be zero. On output, $Htable is advanced to the next row and xmm2 is |
|
# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6. |
|
sub process_rows { |
|
my ($rows) = @_; |
|
$call_counter++; |
|
|
|
# Shifting whole XMM registers by bits is complex. psrldq shifts by |
|
# bytes, and psrlq shifts the two 64-bit halves separately. Each row |
|
# produces 8 bits of carry, and the reduction needs an additional 7-bit |
|
# shift. This must fit in 64 bits so reduction can use psrlq. This |
|
# allows up to 7 rows at a time. |
|
die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64); |
|
|
|
&mov("eax", $rows); |
|
&set_label("loop_row_$call_counter"); |
|
&movdqa("xmm4", &QWP(0, $Htable)); |
|
&lea($Htable, &DWP(16, $Htable)); |
|
|
|
# Right-shift xmm2 and xmm3 by 8 bytes. |
|
&movdqa("xmm6", "xmm2"); |
|
&palignr("xmm6", "xmm3", 1); |
|
&movdqa("xmm3", "xmm6"); |
|
&psrldq("xmm2", 1); |
|
|
|
# Load the next table row and index the low and high bits of the input. |
|
# Note the low (respectively, high) half corresponds to more |
|
# (respectively, less) significant coefficients. |
|
&movdqa("xmm5", "xmm4"); |
|
&pshufb("xmm4", "xmm0"); |
|
&pshufb("xmm5", "xmm1"); |
|
|
|
# Add the high half (xmm5) without shifting. |
|
&pxor("xmm2", "xmm5"); |
|
|
|
# Add the low half (xmm4). This must be right-shifted by 4 bits. First, |
|
# add into the carry register (xmm3). |
|
&movdqa("xmm5", "xmm4"); |
|
&psllq("xmm5", 60); |
|
&movdqa("xmm6", "xmm5"); |
|
&pslldq("xmm6", 8); |
|
&pxor("xmm3", "xmm6"); |
|
|
|
# Next, add into xmm2. |
|
&psrldq("xmm5", 8); |
|
&pxor("xmm2", "xmm5"); |
|
&psrlq("xmm4", 4); |
|
&pxor("xmm2", "xmm4"); |
|
|
|
&sub("eax", 1); |
|
&jnz(&label("loop_row_$call_counter")); |
|
|
|
# Reduce the carry register. The reduction polynomial is 1 + x + x^2 + |
|
# x^7, so we shift and XOR four times. |
|
&pxor("xmm2", "xmm3"); # x^0 = 0 |
|
&psrlq("xmm3", 1); |
|
&pxor("xmm2", "xmm3"); # x^1 = x |
|
&psrlq("xmm3", 1); |
|
&pxor("xmm2", "xmm3"); # x^(1+1) = x^2 |
|
&psrlq("xmm3", 5); |
|
&pxor("xmm2", "xmm3"); # x^(1+1+5) = x^7 |
|
&pxor("xmm3", "xmm3"); |
|
____ |
|
} |
|
|
|
# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|. |
|
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is |
|
# formatted as described above. |
|
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]); |
|
&function_begin("gcm_gmult_ssse3"); |
|
&mov($Xi, &wparam(0)); |
|
&mov($Htable, &wparam(1)); |
|
|
|
&movdqu("xmm0", &QWP(0, $Xi)); |
|
&call(&label("pic_point")); |
|
&set_label("pic_point"); |
|
&blindpop("eax"); |
|
&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax")); |
|
&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax")); |
|
|
|
# Reverse input bytes to deserialize. |
|
&pshufb("xmm0", "xmm7"); |
|
|
|
# Split each byte into low (xmm0) and high (xmm1) halves. |
|
&movdqa("xmm1", "xmm2"); |
|
&pandn("xmm1", "xmm0"); |
|
&psrld("xmm1", 4); |
|
&pand("xmm0", "xmm2"); |
|
|
|
# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note |
|
# that, due to bit reversal, xmm3 contains bits that fall off when |
|
# right-shifting, not left-shifting. |
|
&pxor("xmm2", "xmm2"); |
|
&pxor("xmm3", "xmm3"); |
|
|
|
# We must reduce at least once every 7 rows, so divide into three |
|
# chunks. |
|
&process_rows(5); |
|
&process_rows(5); |
|
&process_rows(6); |
|
|
|
# Store the result. Reverse bytes to serialize. |
|
&pshufb("xmm2", "xmm7"); |
|
&movdqu(&QWP(0, $Xi), "xmm2"); |
|
|
|
# Zero any registers which contain secrets. |
|
&pxor("xmm0", "xmm0"); |
|
&pxor("xmm1", "xmm1"); |
|
&pxor("xmm2", "xmm2"); |
|
&pxor("xmm3", "xmm3"); |
|
&pxor("xmm4", "xmm4"); |
|
&pxor("xmm5", "xmm5"); |
|
&pxor("xmm6", "xmm6"); |
|
&function_end("gcm_gmult_ssse3"); |
|
|
|
# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as |
|
# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's |
|
# serialized byte representation. |Htable| is formatted as described above. |
|
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, |
|
# size_t len); |
|
&function_begin("gcm_ghash_ssse3"); |
|
&mov($Xi, &wparam(0)); |
|
&mov($Htable, &wparam(1)); |
|
&mov($in, &wparam(2)); |
|
&mov($len, &wparam(3)); |
|
|
|
&movdqu("xmm0", &QWP(0, $Xi)); |
|
&call(&label("pic_point")); |
|
&set_label("pic_point"); |
|
&blindpop("ebx"); |
|
&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx")); |
|
|
|
# This function only processes whole blocks. |
|
&and($len, -16); |
|
|
|
# Reverse input bytes to deserialize. We maintain the running |
|
# total in xmm0. |
|
&pshufb("xmm0", "xmm7"); |
|
|
|
# Iterate over each block. On entry to each iteration, xmm3 is zero. |
|
&pxor("xmm3", "xmm3"); |
|
&set_label("loop_ghash"); |
|
&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx")); |
|
|
|
# Incorporate the next block of input. |
|
&movdqu("xmm1", &QWP(0, $in)); |
|
&pshufb("xmm1", "xmm7"); # Reverse bytes. |
|
&pxor("xmm0", "xmm1"); |
|
|
|
# Split each byte into low (xmm0) and high (xmm1) halves. |
|
&movdqa("xmm1", "xmm2"); |
|
&pandn("xmm1", "xmm0"); |
|
&psrld("xmm1", 4); |
|
&pand("xmm0", "xmm2"); |
|
|
|
# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note |
|
# that, due to bit reversal, xmm3 contains bits that fall off when |
|
# right-shifting, not left-shifting. |
|
&pxor("xmm2", "xmm2"); |
|
# xmm3 is already zero at this point. |
|
|
|
# We must reduce at least once every 7 rows, so divide into three |
|
# chunks. |
|
&process_rows(5); |
|
&process_rows(5); |
|
&process_rows(6); |
|
|
|
&movdqa("xmm0", "xmm2"); |
|
|
|
# Rewind $Htable for the next iteration. |
|
&lea($Htable, &DWP(-256, $Htable)); |
|
|
|
# Advance input and continue. |
|
&lea($in, &DWP(16, $in)); |
|
&sub($len, 16); |
|
&jnz(&label("loop_ghash")); |
|
|
|
# Reverse bytes and store the result. |
|
&pshufb("xmm0", "xmm7"); |
|
&movdqu(&QWP(0, $Xi), "xmm0"); |
|
|
|
# Zero any registers which contain secrets. |
|
&pxor("xmm0", "xmm0"); |
|
&pxor("xmm1", "xmm1"); |
|
&pxor("xmm2", "xmm2"); |
|
&pxor("xmm3", "xmm3"); |
|
&pxor("xmm4", "xmm4"); |
|
&pxor("xmm5", "xmm5"); |
|
&pxor("xmm6", "xmm6"); |
|
&function_end("gcm_ghash_ssse3"); |
|
|
|
# reverse_bytes is a permutation which, if applied with pshufb, reverses the |
|
# bytes in an XMM register. |
|
&set_label("reverse_bytes", 16); |
|
&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
|
# low4_mask is an XMM mask which selects the low four bits of each byte. |
|
&set_label("low4_mask", 16); |
|
&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f); |
|
|
|
&asm_finish(); |
|
|
|
close STDOUT or die "error closing STDOUT: $!";
|
|
|