diff options
Diffstat (limited to 'engines/asm')
-rw-r--r-- | engines/asm/e_padlock-x86.pl | 104 | ||||
-rw-r--r-- | engines/asm/e_padlock-x86_64.pl | 178 |
2 files changed, 204 insertions, 78 deletions
diff --git a/engines/asm/e_padlock-x86.pl b/engines/asm/e_padlock-x86.pl index 1b2ba52253..4148468c41 100644 --- a/engines/asm/e_padlock-x86.pl +++ b/engines/asm/e_padlock-x86.pl @@ -37,7 +37,7 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); -%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata +%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 $ctx="edx"; @@ -188,10 +188,6 @@ my ($mode,$opcode) = @_; &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter } else { &xor ("ebx","ebx"); - if ($PADLOCK_MARGIN{$mode}) { - &cmp ($len,$PADLOCK_MARGIN{$mode}); - &jbe (&label("${mode}_short")); - } &test (&DWP(0,$ctx),1<<5); # align bit in control word &jnz (&label("${mode}_aligned")); &test ($out,0x0f); @@ -212,7 +208,27 @@ my ($mode,$opcode) = @_; &neg ("eax"); &and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK &lea ("esp",&DWP(0,"eax","ebp")); # alloca + &mov ("eax",$PADLOCK_CHUNK); + &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK + &mov ("eax","ebp"); + &and ("ebp",-16); &and ("esp",-16); + &mov (&DWP(16,"ebp"),"eax"); + if ($PADLOCK_PREFETCH{$mode}) { + &cmp ($len,$chunk); + &ja (&label("${mode}_loop")); + &mov ("eax",$inp); # check if prefetch crosses page + &cmp ("ebp","esp"); + &cmove ("eax",$out); + &add ("eax",$len); + &neg ("eax"); + &and ("eax",0xfff); # distance to page boundary + &cmp ("eax",$PADLOCK_PREFETCH{$mode}); + &mov ("eax",-$PADLOCK_PREFETCH{$mode}); + &cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1 + &and ($chunk,"eax"); + &jz (&label("${mode}_unaligned_tail")); + } &jmp (&label("${mode}_loop")); &set_label("${mode}_loop",16); @@ -276,8 +292,8 @@ my ($mode,$opcode) = @_; &test ($out,0x0f); &jz (&label("${mode}_out_aligned")); &mov ($len,$chunk); - &shr ($len,2); &lea ($inp,&DWP(0,"esp")); + &shr ($len,2); &data_byte(0xf3,0xa5); # rep movsl &sub ($out,$chunk); &set_label("${mode}_out_aligned"); @@ -288,7 +304,30 @@ my ($mode,$opcode) = @_; &add ($inp,$chunk); &sub ($len,$chunk); &mov ($chunk,$PADLOCK_CHUNK); + if (!$PADLOCK_PREFETCH{$mode}) { &jnz (&label("${mode}_loop")); + } else { + &jz (&label("${mode}_break")); + &cmp ($len,$chunk); + &jae (&label("${mode}_loop")); + +&set_label("${mode}_unaligned_tail"); + &xor ("eax","eax"); + &cmp ("esp","ebp"); + &cmove ("eax",$len); + &sub ("esp","eax"); # alloca + &mov ("eax", $out); # save parameters + &mov ($chunk,$len); + &shr ($len,2); + &lea ($out,&DWP(0,"esp")); + &data_byte(0xf3,0xa5); # rep movsl + &mov ($inp,"esp"); + &mov ($out,"eax"); # restore parameters + &mov ($len,$chunk); + &jmp (&label("${mode}_loop")); + +&set_label("${mode}_break",16); + } if ($mode ne "ctr32") { &cmp ("esp","ebp"); &je (&label("${mode}_done")); @@ -302,28 +341,24 @@ my ($mode,$opcode) = @_; &ja (&label("${mode}_bzero")); &set_label("${mode}_done"); + &mov ("ebp",&DWP(16,"ebp")); &lea ("esp",&DWP(24,"ebp")); if ($mode ne "ctr32") { &jmp (&label("${mode}_exit")); -&set_label("${mode}_short",16); - &xor ("eax","eax"); - &lea ("ebp",&DWP(-24,"esp")); - &sub ("eax",$len); - &lea ("esp",&DWP(0,"eax","ebp")); - &and ("esp",-16); - &xor ($chunk,$chunk); -&set_label("${mode}_short_copy"); - &movups ("xmm0",&QWP(0,$inp,$chunk)); - &lea ($chunk,&DWP(16,$chunk)); - &cmp ($len,$chunk); - &movaps (&QWP(-16,"esp",$chunk),"xmm0"); - &ja (&label("${mode}_short_copy")); - &mov ($inp,"esp"); - &mov ($chunk,$len); - &jmp (&label("${mode}_loop")); - &set_label("${mode}_aligned",16); + if ($PADLOCK_PREFETCH{$mode}) { + &lea ("ebp",&DWP(0,$inp,$len)); + &neg ("ebp"); + &and ("ebp",0xfff); # distance to page boundary + &xor ("eax","eax"); + &cmp ("ebp",$PADLOCK_PREFETCH{$mode}); + &mov ("ebp",$PADLOCK_PREFETCH{$mode}-1); + &cmovae ("ebp","eax"); + &and ("ebp",$len); # remainder + &sub ($len,"ebp"); + &jz (&label("${mode}_aligned_tail")); + } &lea ("eax",&DWP(-16,$ctx)); # ivp &lea ("ebx",&DWP(16,$ctx)); # key &shr ($len,4); # len/=AES_BLOCK_SIZE @@ -332,6 +367,29 @@ my ($mode,$opcode) = @_; &movaps ("xmm0",&QWP(0,"eax")); &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv } + if ($PADLOCK_PREFETCH{$mode}) { + &test ("ebp","ebp"); + &jz (&label("${mode}_exit")); + +&set_label("${mode}_aligned_tail"); + &mov ($len,"ebp"); + &lea ("ebp",&DWP(-24,"esp")); + &mov ("esp","ebp"); + &mov ("eax","ebp"); + &sub ("esp",$len); + &and ("ebp",-16); + &and ("esp",-16); + &mov (&DWP(16,"ebp"),"eax"); + &mov ("eax", $out); # save parameters + &mov ($chunk,$len); + &shr ($len,2); + &lea ($out,&DWP(0,"esp")); + &data_byte(0xf3,0xa5); # rep movsl + &mov ($inp,"esp"); + &mov ($out,"eax"); # restore parameters + &mov ($len,$chunk); + &jmp (&label("${mode}_loop")); + } &set_label("${mode}_exit"); } &mov ("eax",1); &lea ("esp",&DWP(4,"esp")); # popf diff --git a/engines/asm/e_padlock-x86_64.pl b/engines/asm/e_padlock-x86_64.pl index 5091c7aaca..297561a61b 100644 --- a/engines/asm/e_padlock-x86_64.pl +++ b/engines/asm/e_padlock-x86_64.pl @@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output"; $code=".text\n"; -%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata +%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 $ctx="%rdx"; @@ -285,17 +285,6 @@ padlock_${mode}_encrypt: lea 16($ctx),$ctx # control word xor %eax,%eax xor %ebx,%ebx -___ -# Formally speaking correct condtion is $len<=$margin and $inp+$margin -# crosses page boundary [and next page is unreadable]. But $inp can -# be unaligned in which case data can be copied to $out if latter is -# aligned, in which case $out+$margin has to be checked. Covering all -# cases appears more complicated than just copying short input... -$code.=<<___ if ($PADLOCK_MARGIN{$mode}); - cmp \$$PADLOCK_MARGIN{$mode},$len - jbe .L${mode}_short -___ -$code.=<<___; testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out @@ -315,6 +304,8 @@ $code.=<<___; neg %rax and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK lea (%rax,%rbp),%rsp + mov \$$PADLOCK_CHUNK,%rax + cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK ___ $code.=<<___ if ($mode eq "ctr32"); .L${mode}_reenter: @@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32"); bswap %eax neg %eax and \$`$PADLOCK_CHUNK/16-1`,%eax - jz .L${mode}_loop + mov \$$PADLOCK_CHUNK,$chunk shl \$4,%eax + cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK + cmovbe $len,$chunk +___ +$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); + cmp $chunk,$len + ja .L${mode}_loop + mov $inp,%rax # check if prefetch crosses page + cmp %rsp,%rbp + cmove $out,%rax + add $len,%rax + neg %rax + and \$0xfff,%rax # distance to page boundary + cmp \$$PADLOCK_PREFETCH{$mode},%rax + mov \$-$PADLOCK_PREFETCH{$mode},%rax + cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1 + and %rax,$chunk + jz .L${mode}_unaligned_tail ___ $code.=<<___; jmp .L${mode}_loop @@ -360,12 +368,12 @@ ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter test \$0xffff0000,%eax - jnz .L${mode}_no_corr + jnz .L${mode}_no_carry bswap %eax add \$0x10000,%eax bswap %eax mov %eax,-4($ctx) -.L${mode}_no_corr: +.L${mode}_no_carry: ___ $code.=<<___; mov %r8,$out # restore paramters @@ -373,8 +381,8 @@ $code.=<<___; test \$0x0f,$out jz .L${mode}_out_aligned mov $chunk,$len - shr \$3,$len lea (%rsp),$inp + shr \$3,$len .byte 0xf3,0x48,0xa5 # rep movsq sub $chunk,$out .L${mode}_out_aligned: @@ -384,9 +392,52 @@ $code.=<<___; add $chunk,$inp sub $chunk,$len mov \$$PADLOCK_CHUNK,$chunk +___ + if (!$PADLOCK_PREFETCH{$mode}) { +$code.=<<___; jnz .L${mode}_loop - +___ + } else { +$code.=<<___; + jz .L${mode}_break + cmp $chunk,$len + jae .L${mode}_loop +___ +$code.=<<___ if ($mode eq "ctr32"); + mov $len,$chunk + mov $inp,%rax # check if prefetch crosses page cmp %rsp,%rbp + cmove $out,%rax + add $len,%rax + neg %rax + and \$0xfff,%rax # distance to page boundary + cmp \$$PADLOCK_PREFETCH{$mode},%rax + mov \$-$PADLOCK_PREFETCH{$mode},%rax + cmovae $chunk,%rax + and %rax,$chunk + jnz .L${mode}_loop +___ +$code.=<<___; +.L${mode}_unaligned_tail: + xor %eax,%eax + cmp %rsp,%rbp + cmove $len,%rax + mov $out,%r8 # save parameters + mov $len,$chunk + sub %rax,%rsp # alloca + shr \$3,$len + lea (%rsp),$out + .byte 0xf3,0x48,0xa5 # rep movsq + mov %rsp,$inp + mov %r8, $out # restore parameters + mov $chunk,$len + jmp .L${mode}_loop +.align 16 +.L${mode}_break: +___ + } +$code.=<<___; + cmp %rbp,%rsp je .L${mode}_done pxor %xmm0,%xmm0 @@ -400,70 +451,87 @@ $code.=<<___; .L${mode}_done: lea (%rbp),%rsp jmp .L${mode}_exit -___ -$code.=<<___ if ($PADLOCK_MARGIN{$mode}); -.align 16 -.L${mode}_short: - mov %rsp,%rbp - sub $len,%rsp - xor $chunk,$chunk -.L${mode}_short_copy: - movups ($inp,$chunk),%xmm0 - lea 16($chunk),$chunk - cmp $chunk,$len - movaps %xmm0,-16(%rsp,$chunk) - ja .L${mode}_short_copy - mov %rsp,$inp - mov $len,$chunk - jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"` -___ -$code.=<<___; + .align 16 .L${mode}_aligned: ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter - mov \$`16*0x10000`,$chunk bswap %eax - cmp $len,$chunk - cmova $len,$chunk neg %eax and \$0xffff,%eax - jz .L${mode}_aligned_loop + mov \$`16*0x10000`,$chunk shl \$4,%eax + cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross 2^16 - jmp .L${mode}_aligned_loop -.align 16 + cmovbe $len,$chunk + jbe .L${mode}_aligned_skip + .L${mode}_aligned_loop: - cmp $len,$chunk - cmova $len,$chunk mov $len,%r10 # save parameters mov $chunk,$len mov $chunk,%r11 -___ -$code.=<<___; + lea -16($ctx),%rax # ivp lea 16($ctx),%rbx # key shr \$4,$len # len/=AES_BLOCK_SIZE .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* -___ -$code.=<<___ if ($mode !~ /ecb|ctr/); - movdqa (%rax),%xmm0 - movdqa %xmm0,-16($ctx) # copy [or refresh] iv -___ -$code.=<<___ if ($mode eq "ctr32"); + mov -4($ctx),%eax # pull 32-bit counter bswap %eax add \$0x10000,%eax bswap %eax mov %eax,-4($ctx) - mov %r11,$chunk # restore paramters - mov %r10,$len - sub $chunk,$len + mov %r10,$len # restore paramters + sub %r11,$len mov \$`16*0x10000`,$chunk - jnz .L${mode}_aligned_loop + jz .L${mode}_exit + cmp $chunk,$len + jae .L${mode}_aligned_loop + +.L${mode}_aligned_skip: +___ +$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); + lea ($inp,$len),%rbp + neg %rbp + and \$0xfff,%rbp # distance to page boundary + xor %eax,%eax + cmp \$$PADLOCK_PREFETCH{$mode},%rbp + mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp + cmovae %rax,%rbp + and $len,%rbp # remainder + sub %rbp,$len + jz .L${mode}_aligned_tail +___ +$code.=<<___; + lea -16($ctx),%rax # ivp + lea 16($ctx),%rbx # key + shr \$4,$len # len/=AES_BLOCK_SIZE + .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* +___ +$code.=<<___ if ($mode !~ /ecb|ctr/); + movdqa (%rax),%xmm0 + movdqa %xmm0,-16($ctx) # copy [or refresh] iv +___ +$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); + test %rbp,%rbp # check remainder + jz .L${mode}_exit + +.L${mode}_aligned_tail: + mov $out,%r8 + mov %rbp,$chunk + mov %rbp,$len + lea (%rsp),%rbp + sub $len,%rsp + shr \$3,$len + lea (%rsp),$out + .byte 0xf3,0x48,0xa5 # rep movsq + lea (%r8),$out + lea (%rsp),$inp + mov $chunk,$len + jmp .L${mode}_loop ___ $code.=<<___; .L${mode}_exit: |