Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/boringssl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdam Langley <agl@chromium.org>2014-06-20 23:00:00 +0400
committerAdam Langley <agl@chromium.org>2014-06-21 00:17:38 +0400
commitd81f801f6d393095c37b9df9b5e6757ba4570186 (patch)
treec40ac5d5dc2d53b747384f7739b1eeb9a44e67b0 /crypto/modes
parentb6333d600e0f54707cba962093ef3eca0312d6bc (diff)
ghash-x86[_64].pl: ~15% improvement on Atom Silvermont
(other processors unaffected). (Imported from upstream's 7078d93307d795cec577ec4a792b72fffed551ab)
Diffstat (limited to 'crypto/modes')
-rw-r--r--crypto/modes/asm/ghash-x86.pl16
-rw-r--r--crypto/modes/asm/ghash-x86_64.pl62
2 files changed, 46 insertions, 32 deletions
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
index d47e325a..eb6d55ee 100644
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -1021,13 +1021,14 @@ my ($Xhi,$Xi) = @_;
&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
&movdqa ($Xhn,$Xn);
&pxor ($T1,$Xn); #
+ &lea ($inp,&DWP(32,$inp)); # i+=2
&pclmulqdq ($Xn,$Hkey,0x00); #######
&pclmulqdq ($Xhn,$Hkey,0x11); #######
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&pclmulqdq ($T1,$T3,0x00); #######
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &nop ();
- &lea ($inp,&DWP(32,$inp)); # i+=2
&sub ($len,0x20);
&jbe (&label("even_tail"));
&jmp (&label("mod_loop"));
@@ -1036,22 +1037,23 @@ my ($Xhi,$Xi) = @_;
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
+ &nop ();
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
&pclmulqdq ($T2,$T3,0x10); #######
- &movdqa ($T3,&QWP(0,$const));
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &movdqa ($T3,&QWP(0,$const));
&xorps ($Xhi,$Xhn);
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pxor ($T1,$Xhi); #
- &pxor ($T2,$T1); #
&pshufb ($Xhn,$T3);
+ &pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
@@ -1068,8 +1070,8 @@ my ($Xhi,$Xi) = @_;
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &movups ($T3,&QWP(32,$Htbl));
&pclmulqdq ($Xn,$Hkey,0x00); #######
+ &movups ($T3,&QWP(32,$Htbl));
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
@@ -1080,9 +1082,9 @@ my ($Xhi,$Xi) = @_;
&movdqa ($T2,$Xi); # 2nd phase
&psrlq ($Xi,1);
&pxor ($T1,$Xhn);
+ &pxor ($Xhi,$T2); #
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
- &pxor ($Xhi,$T2); #
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 79042480..04001e6a 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -214,6 +214,7 @@ ___
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,\@function,2
@@ -597,7 +598,8 @@ ___
}
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+ my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
$code.=<<___;
.globl gcm_ghash_clmul
@@ -624,7 +626,6 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
- mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
@@ -640,10 +641,16 @@ if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
$code.=<<___;
+ mov OPENSSL_ia32cap_P+4(%rip),%eax
cmp \$0x30,$len
jb .Lskip4x
+ and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
+ cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
+ je .Lskip4x
+
sub \$0x30,$len
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu 0x30($Htbl),$Hkey3
movdqu 0x40($Htbl),$Hkey4
@@ -819,51 +826,54 @@ $code.=<<___;
pxor $T1,$Xi # Ii+Xi
movdqa $Xln,$Xhn
- pshufd \$0b01001110,$Xln,$T1
- pxor $Xln,$T1
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
pclmulqdq \$0x00,$Hkey,$Xln
pclmulqdq \$0x11,$Hkey,$Xhn
- pclmulqdq \$0x00,$HK,$T1
+ pclmulqdq \$0x00,$HK,$Xmn
lea 32($inp),$inp # i+=2
+ nop
sub \$0x20,$len
jbe .Leven_tail
+ nop
jmp .Lmod_loop
.align 32
.Lmod_loop:
movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T2 #
- pxor $Xi,$T2 #
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$T2
+ pclmulqdq \$0x10,$HK,$Xmn
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
movdqu ($inp),$Xhn # Ii
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
pshufb $T3,$Xhn
movdqu 16($inp),$Xln # Ii+1
- pxor $Xi,$T1 # aggregated Karatsuba post-processing
pxor $Xhi,$T1
pxor $Xhn,$Xhi # "Ii+Xi", consume early
- pxor $T1,$T2
+ pxor $T1,$Xmn
pshufb $T3,$Xln
- movdqa $T2,$T1 #
+ movdqa $Xmn,$T1 #
psrldq \$8,$T1
- pslldq \$8,$T2 #
+ pslldq \$8,$Xmn #
pxor $T1,$Xhi
- pxor $T2,$Xi #
+ pxor $Xmn,$Xi #
movdqa $Xln,$Xhn #
movdqa $Xi,$T2 # 1st phase
movdqa $Xi,$T1
psllq \$5,$Xi
- pclmulqdq \$0x00,$Hkey,$Xln #######
pxor $Xi,$T1 #
+ pclmulqdq \$0x00,$Hkey,$Xln #######
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$57,$Xi #
@@ -871,9 +881,9 @@ $code.=<<___;
pslldq \$8,$Xi
psrldq \$8,$T1 #
pxor $T2,$Xi
+ pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #
- pshufd \$0b01001110,$Xhn,$T1
- pxor $Xhn,$T1 #
+ pxor $Xhn,$Xmn #
pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
@@ -882,33 +892,35 @@ $code.=<<___;
pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
+ lea 32($inp),$inp
psrlq \$1,$Xi #
- pclmulqdq \$0x00,$HK,$T1 #######
+ pclmulqdq \$0x00,$HK,$Xmn #######
pxor $Xhi,$Xi #
+ .byte 0x66,0x90
- lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T2 #
- pxor $Xi,$T2 #
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$T2
+ pclmulqdq \$0x10,$HK,$Xmn
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
pxor $Xi,$T1
pxor $Xhi,$T1
- pxor $T1,$T2
- movdqa $T2,$T1 #
+ pxor $T1,$Xmn
+ movdqa $Xmn,$T1 #
psrldq \$8,$T1
- pslldq \$8,$T2 #
+ pslldq \$8,$Xmn #
pxor $T1,$Xhi
- pxor $T2,$Xi #
+ pxor $Xmn,$Xi #
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;