aboutsummaryrefslogtreecommitdiffstats
path: root/crypto/bn
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2016-03-16 23:33:53 +0100
committerAndy Polyakov <appro@openssl.org>2016-08-22 14:58:32 +0200
commit3ba1ef829cf3dd36eaa5e819258d90291c6a1027 (patch)
tree55d682acc16b649f0edef4a320b658e5eb3b2ea0 /crypto/bn
parentfe34735c194b9d81dd84e89bbfcfc67c90aa99b3 (diff)
downloadopenssl-3ba1ef829cf3dd36eaa5e819258d90291c6a1027.tar.gz
bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.
Original strategy for page-walking was adjust stack pointer and then touch pages in order. This kind of asks for double-fault, because if touch fails, then signal will be delivered to frame above adjusted stack pointer. But touching pages prior adjusting stack pointer would upset valgrind. As compromise let's adjust stack pointer in pages, touching top of the stack. This still asks for double-fault, but at least prevents corruption of neighbour stack if allocation is to overstep the guard page. Also omit predict-non-taken hints as they reportedly trigger illegal instructions in some VM setups. Reviewed-by: Richard Levitte <levitte@openssl.org>
Diffstat (limited to 'crypto/bn')
-rwxr-xr-xcrypto/bn/asm/x86-mont.pl41
-rwxr-xr-xcrypto/bn/asm/x86_64-mont.pl185
-rwxr-xr-xcrypto/bn/asm/x86_64-mont5.pl227
3 files changed, 274 insertions, 179 deletions
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index 09296ec662..6787503666 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -73,27 +73,26 @@ $frame=32; # size of above frame rounded up to 16n
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
- &mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
- &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
+ &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arraning 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
- &mov ("eax","esp");
+ &mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
- &sub ("esp","eax"); # this aligns sp and ap modulo 2048
+ &sub ("ebp","eax"); # this aligns sp and ap modulo 2048
- &xor ("edx","esp");
+ &xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
- &sub ("esp","edx"); # this splits them apart modulo 4096
+ &sub ("ebp","edx"); # this splits them apart modulo 4096
- &and ("esp",-64); # align to cache line
+ &and ("ebp",-64); # align to cache line
# An OS-agnostic version of __chkstk.
#
@@ -103,20 +102,28 @@ $frame=32; # size of above frame rounded up to 16n
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
- &mov ("eax","ebp");
- &sub ("eax","esp");
+ &mov ("eax","esp");
+ &sub ("eax","ebp");
&and ("eax",-4096);
-&set_label("page_walk");
- &mov ("edx",&DWP(0,"esp","eax"));
- &sub ("eax",4096);
- &data_byte(0x2e);
- &jnc (&label("page_walk"));
+ &mov ("edx","esp"); # saved stack pointer!
+ &lea ("esp",&DWP(0,"ebp","eax"));
+ &mov ("eax",&DWP(0,"esp"));
+ &cmp ("esp","ebp");
+ &ja (&label("page_walk"));
+ &jmp (&label("page_walk_done"));
+
+&set_label("page_walk",16);
+ &lea ("esp",&DWP(-4096,"esp"));
+ &mov ("eax",&DWP(0,"esp"));
+ &cmp ("esp","ebp");
+ &ja (&label("page_walk"));
+&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
- &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+ &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
@@ -124,11 +131,11 @@ $frame=32; # size of above frame rounded up to 16n
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
- &mov ($_np,"edx");
+ &mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
- &mov ($_sp,"ebp"); # saved stack pointer!
+ &mov ($_sp,"edx"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 3a2511f7f2..0451fef12e 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -104,6 +104,8 @@ $code=<<___;
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
+ mov ${num}d,${num}d
+ mov %rsp,%rax
test \$3,${num}d
jnz .Lmul_enter
cmp \$8,${num}d
@@ -128,15 +130,12 @@ $code.=<<___;
push %r14
push %r15
- mov ${num}d,${num}d
- lea 2($num),%r10
+ neg $num
mov %rsp,%r11
- neg %r10
- lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
- and \$-1024,%rsp # minimize TLB usage
+ lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))
+ neg $num # restore $num
+ and \$-1024,%r10 # minimize TLB usage
- mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lmul_body:
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on stack being "wired" to
@@ -145,14 +144,24 @@ $code.=<<___;
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
- sub %rsp,%r11
+ sub %r10,%r11
and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.align 16
.Lmul_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x66,0x2e # predict non-taken
- jnc .Lmul_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
mov $bp,%r12 # reassign $bp
___
$bp="%r12";
@@ -323,13 +332,13 @@ $code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lmul_epilogue:
ret
.size bn_mul_mont,.-bn_mul_mont
@@ -341,6 +350,8 @@ $code.=<<___;
.type bn_mul4x_mont,\@function,6
.align 16
bn_mul4x_mont:
+ mov ${num}d,${num}d
+ mov %rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
@@ -356,23 +367,29 @@ $code.=<<___;
push %r14
push %r15
- mov ${num}d,${num}d
- lea 4($num),%r10
+ neg $num
mov %rsp,%r11
- neg %r10
- lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
- and \$-1024,%rsp # minimize TLB usage
+ lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
+ neg $num # restore
+ and \$-1024,%r10 # minimize TLB usage
- mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lmul4x_body:
- sub %rsp,%r11
+ sub %r10,%r11
and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
.Lmul4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lmul4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
___
@@ -751,13 +768,13 @@ ___
$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lmul4x_epilogue:
ret
.size bn_mul4x_mont,.-bn_mul4x_mont
@@ -787,14 +804,15 @@ $code.=<<___;
.type bn_sqr8x_mont,\@function,6
.align 32
bn_sqr8x_mont:
-.Lsqr8x_enter:
mov %rsp,%rax
+.Lsqr8x_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lsqr8x_prologue:
mov ${num}d,%r10d
shl \$3,${num}d # convert $num to bytes
@@ -807,33 +825,42 @@ bn_sqr8x_mont:
# do its job.
#
lea -64(%rsp,$num,2),%r11
+ mov %rsp,%rbp
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r11,%rbp # align with $aptr
+ lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lsqr8x_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lsqr8x_page_walk
+ jmp .Lsqr8x_page_walk_done
+
+.align 16
.Lsqr8x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lsqr8x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
mov $num,%r10
neg $num
@@ -957,30 +984,38 @@ $code.=<<___;
.type bn_mulx4x_mont,\@function,6
.align 32
bn_mulx4x_mont:
-.Lmulx4x_enter:
mov %rsp,%rax
+.Lmulx4x_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
- .byte 0x67
xor %r10,%r10
sub $num,%r10 # -$num
mov ($n0),$n0 # *n0
- lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
- and \$-128,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)
+ and \$-128,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.align 16
.Lmulx4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x66,0x2e # predict non-taken
- jnc .Lmulx4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
lea ($bp,$num),%r10
##############################################################
@@ -1341,22 +1376,8 @@ mul_handler:
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
- lea 48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R15
- jmp .Lcommon_seh_tail
+ jmp .Lcommon_pop_regs
.size mul_handler,.-mul_handler
.type sqr_handler,\@abi-omnipotent
@@ -1384,15 +1405,21 @@ sqr_handler:
cmp %r10,%rbx # context->Rip<.Lsqr_body
jb .Lcommon_seh_tail
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # body label
+ cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
+ jb .Lcommon_pop_regs
+
mov 152($context),%rax # pull context->Rsp
- mov 4(%r11),%r10d # HandlerData[1]
+ mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
jae .Lcommon_seh_tail
mov 40(%rax),%rax # pull saved stack pointer
+.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -1479,13 +1506,15 @@ $code.=<<___;
.LSEH_info_bn_sqr8x_mont:
.byte 9,0,0,0
.rva sqr_handler
- .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
+ .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
+.align 8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
.byte 9,0,0,0
.rva sqr_handler
- .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+.align 8
___
}
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 2a7972d610..3278dc6056 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -93,6 +93,8 @@ $code=<<___;
.type bn_mul_mont_gather5,\@function,6
.align 64
bn_mul_mont_gather5:
+ mov ${num}d,${num}d
+ mov %rsp,%rax
test \$7,${num}d
jnz .Lmul_enter
___
@@ -104,10 +106,7 @@ $code.=<<___;
.align 16
.Lmul_enter:
- mov ${num}d,${num}d
- mov %rsp,%rax
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
- lea .Linc(%rip),%r10
push %rbx
push %rbp
push %r12
@@ -115,13 +114,12 @@ $code.=<<___;
push %r14
push %r15
- lea 2($num),%r11
- neg %r11
- lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
- and \$-1024,%rsp # minimize TLB usage
+ neg $num
+ mov %rsp,%r11
+ lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8)
+ neg $num # restore $num
+ and \$-1024,%r10 # minimize TLB usage
- mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lmul_body:
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on stack being "wired" to
@@ -130,13 +128,24 @@ $code.=<<___;
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
- sub %rsp,%rax
- and \$-4096,%rax
+ sub %r10,%r11
+ and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
.Lmul_page_walk:
- mov (%rsp,%rax),%r11
- sub \$4096,%rax
- .byte 0x2e # predict non-taken
- jnc .Lmul_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ lea .Linc(%rip),%r10
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
lea 128($bp),%r12 # reassign $bp (+size optimization)
___
@@ -442,6 +451,8 @@ $code.=<<___;
.type bn_mul4x_mont_gather5,\@function,6
.align 32
bn_mul4x_mont_gather5:
+ .byte 0x67
+ mov %rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
@@ -450,14 +461,13 @@ $code.=<<___ if ($addx);
je .Lmulx4x_enter
___
$code.=<<___;
- .byte 0x67
- mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lmul4x_prologue:
.byte 0x67
shl \$3,${num}d # convert $num to bytes
@@ -474,32 +484,40 @@ $code.=<<___;
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
- sub %r11,%rsp # align with $rp
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $rp
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lmul4xsp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
.Lmul4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lmul4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
neg $num
@@ -1043,6 +1061,7 @@ $code.=<<___;
.type bn_power5,\@function,6
.align 32
bn_power5:
+ mov %rsp,%rax
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1051,13 +1070,13 @@ $code.=<<___ if ($addx);
je .Lpowerx5_enter
___
$code.=<<___;
- mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lpower5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10d # 3*$num
@@ -1072,32 +1091,40 @@ $code.=<<___;
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwr_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lpwr_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwr_page_walk
+ jmp .Lpwr_page_walk_done
+
.Lpwr_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lpwr_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwr_page_walk
+.Lpwr_page_walk_done:
mov $num,%r10
neg $num
@@ -2037,6 +2064,7 @@ bn_from_mont8x:
push %r13
push %r14
push %r15
+.Lfrom_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
@@ -2051,32 +2079,40 @@ bn_from_mont8x:
# last operation, we use the opportunity to cleanse it.
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lfrom_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lfrom_page_walk
+ jmp .Lfrom_page_walk_done
+
.Lfrom_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lfrom_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lfrom_page_walk
+.Lfrom_page_walk_done:
mov $num,%r10
neg $num
@@ -2182,14 +2218,15 @@ $code.=<<___;
.type bn_mulx4x_mont_gather5,\@function,6
.align 32
bn_mulx4x_mont_gather5:
-.Lmulx4x_enter:
mov %rsp,%rax
+.Lmulx4x_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
@@ -2206,31 +2243,39 @@ bn_mulx4x_mont_gather5:
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmulx4xsp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
.Lmulx4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lmulx4xsp_done:
- and \$-64,%rsp # ensure alignment
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp # ensure alignment
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
.Lmulx4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lmulx4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
##############################################################
# Stack layout
@@ -2638,14 +2683,15 @@ $code.=<<___;
.type bn_powerx5,\@function,6
.align 32
bn_powerx5:
-.Lpowerx5_enter:
mov %rsp,%rax
+.Lpowerx5_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lpowerx5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
@@ -2660,32 +2706,40 @@ bn_powerx5:
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwrx_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lpwrx_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwrx_page_walk
+ jmp .Lpwrx_page_walk_done
+
.Lpwrx_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lpwrx_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
mov $num,%r10
neg $num
@@ -3616,9 +3670,14 @@ mul_handler:
cmp %r10,%rbx # context->Rip<end of prologue label
jb .Lcommon_seh_tail
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jb .Lcommon_pop_regs
+
mov 152($context),%rax # pull context->Rsp
- mov 4(%r11),%r10d # HandlerData[1]
+ mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
@@ -3630,11 +3689,11 @@ mul_handler:
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
- jmp .Lbody_proceed
+ jmp .Lcommon_pop_regs
.Lbody_40:
mov 40(%rax),%rax # pull saved stack pointer
-.Lbody_proceed:
+.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -3725,34 +3784,34 @@ $code.=<<___;
.LSEH_info_bn_mul_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+ .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_mul4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+ .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_power5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]
+ .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_from_mont8x:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
+ .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
___
$code.=<<___ if ($addx);
.align 8
.LSEH_info_bn_mulx4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_powerx5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
+ .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
___
$code.=<<___;
.align 8