diff options
-rwxr-xr-x | crypto/bn/asm/x86-mont.pl | 41 | ||||
-rwxr-xr-x | crypto/bn/asm/x86_64-mont.pl | 185 | ||||
-rwxr-xr-x | crypto/bn/asm/x86_64-mont5.pl | 227 |
3 files changed, 274 insertions, 179 deletions
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl index 09296ec662..6787503666 100755 --- a/crypto/bn/asm/x86-mont.pl +++ b/crypto/bn/asm/x86-mont.pl @@ -73,27 +73,26 @@ $frame=32; # size of above frame rounded up to 16n &lea ("esi",&wparam(0)); # put aside pointer to argument block &lea ("edx",&wparam(1)); # load ap - &mov ("ebp","esp"); # saved stack pointer! &add ("edi",2); # extra two words on top of tp &neg ("edi"); - &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) + &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2)) &neg ("edi"); # minimize cache contention by arraning 2K window between stack # pointer and ap argument [np is also position sensitive vector, # but it's assumed to be near ap, as it's allocated at ~same # time]. - &mov ("eax","esp"); + &mov ("eax","ebp"); &sub ("eax","edx"); &and ("eax",2047); - &sub ("esp","eax"); # this aligns sp and ap modulo 2048 + &sub ("ebp","eax"); # this aligns sp and ap modulo 2048 - &xor ("edx","esp"); + &xor ("edx","ebp"); &and ("edx",2048); &xor ("edx",2048); - &sub ("esp","edx"); # this splits them apart modulo 4096 + &sub ("ebp","edx"); # this splits them apart modulo 4096 - &and ("esp",-64); # align to cache line + &and ("ebp",-64); # align to cache line # An OS-agnostic version of __chkstk. # @@ -103,20 +102,28 @@ $frame=32; # size of above frame rounded up to 16n # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... - &mov ("eax","ebp"); - &sub ("eax","esp"); + &mov ("eax","esp"); + &sub ("eax","ebp"); &and ("eax",-4096); -&set_label("page_walk"); - &mov ("edx",&DWP(0,"esp","eax")); - &sub ("eax",4096); - &data_byte(0x2e); - &jnc (&label("page_walk")); + &mov ("edx","esp"); # saved stack pointer! + &lea ("esp",&DWP(0,"ebp","eax")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); + &jmp (&label("page_walk_done")); + +&set_label("page_walk",16); + &lea ("esp",&DWP(-4096,"esp")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); +&set_label("page_walk_done"); ################################# load argument block... &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp - &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np + &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 #&mov ("edi",&DWP(5*4,"esi"));# int num @@ -124,11 +131,11 @@ $frame=32; # size of above frame rounded up to 16n &mov ($_rp,"eax"); # ... save a copy of argument block &mov ($_ap,"ebx"); &mov ($_bp,"ecx"); - &mov ($_np,"edx"); + &mov ($_np,"ebp"); &mov ($_n0,"esi"); &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling #&mov ($_num,$num); # redundant as $num is not reused - &mov ($_sp,"ebp"); # saved stack pointer! + &mov ($_sp,"edx"); # saved stack pointer! if($sse2) { $acc0="mm0"; # mmx register bank layout diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index 3a2511f7f2..0451fef12e 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -104,6 +104,8 @@ $code=<<___; .type bn_mul_mont,\@function,6 .align 16 bn_mul_mont: + mov ${num}d,${num}d + mov %rsp,%rax test \$3,${num}d jnz .Lmul_enter cmp \$8,${num}d @@ -128,15 +130,12 @@ $code.=<<___; push %r14 push %r15 - mov ${num}d,${num}d - lea 2($num),%r10 + neg $num mov %rsp,%r11 - neg %r10 - lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2)) - and \$-1024,%rsp # minimize TLB usage + lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2)) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage - mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lmul_body: # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to @@ -145,14 +144,24 @@ $code.=<<___; # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... - sub %rsp,%r11 + sub %r10,%r11 and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 .Lmul_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x66,0x2e # predict non-taken - jnc .Lmul_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul_body: mov $bp,%r12 # reassign $bp ___ $bp="%r12"; @@ -323,13 +332,13 @@ $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lmul_epilogue: ret .size bn_mul_mont,.-bn_mul_mont @@ -341,6 +350,8 @@ $code.=<<___; .type bn_mul4x_mont,\@function,6 .align 16 bn_mul4x_mont: + mov ${num}d,${num}d + mov %rsp,%rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); @@ -356,23 +367,29 @@ $code.=<<___; push %r14 push %r15 - mov ${num}d,${num}d - lea 4($num),%r10 + neg $num mov %rsp,%r11 - neg %r10 - lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) - and \$-1024,%rsp # minimize TLB usage + lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) + neg $num # restore + and \$-1024,%r10 # minimize TLB usage - mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lmul4x_body: - sub %rsp,%r11 + sub %r10,%r11 and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + .Lmul4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lmul4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul4x_body: mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp mov %rdx,%r12 # reassign $bp ___ @@ -751,13 +768,13 @@ ___ $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lmul4x_epilogue: ret .size bn_mul4x_mont,.-bn_mul4x_mont @@ -787,14 +804,15 @@ $code.=<<___; .type bn_sqr8x_mont,\@function,6 .align 32 bn_sqr8x_mont: -.Lsqr8x_enter: mov %rsp,%rax +.Lsqr8x_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lsqr8x_prologue: mov ${num}d,%r10d shl \$3,${num}d # convert $num to bytes @@ -807,33 +825,42 @@ bn_sqr8x_mont: # do its job. # lea -64(%rsp,$num,2),%r11 + mov %rsp,%rbp mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt - sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r11,%rbp # align with $aptr + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lsqr8x_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 .Lsqr8x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lsqr8x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: mov $num,%r10 neg $num @@ -957,30 +984,38 @@ $code.=<<___; .type bn_mulx4x_mont,\@function,6 .align 32 bn_mulx4x_mont: -.Lmulx4x_enter: mov %rsp,%rax +.Lmulx4x_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes - .byte 0x67 xor %r10,%r10 sub $num,%r10 # -$num mov ($n0),$n0 # *n0 - lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8) - and \$-128,%rsp - mov %rax,%r11 - sub %rsp,%r11 + lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8) + and \$-128,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 .Lmulx4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x66,0x2e # predict non-taken - jnc .Lmulx4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: lea ($bp,$num),%r10 ############################################################## @@ -1341,22 +1376,8 @@ mul_handler: mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R15 - jmp .Lcommon_seh_tail + jmp .Lcommon_pop_regs .size mul_handler,.-mul_handler .type sqr_handler,\@abi-omnipotent @@ -1384,15 +1405,21 @@ sqr_handler: cmp %r10,%rbx # context->Rip<.Lsqr_body jb .Lcommon_seh_tail + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # body label + cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue + jb .Lcommon_pop_regs + mov 152($context),%rax # pull context->Rsp - mov 4(%r11),%r10d # HandlerData[1] + mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue jae .Lcommon_seh_tail mov 40(%rax),%rax # pull saved stack pointer +.Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -1479,13 +1506,15 @@ $code.=<<___; .LSEH_info_bn_sqr8x_mont: .byte 9,0,0,0 .rva sqr_handler - .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] + .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] +.align 8 ___ $code.=<<___ if ($addx); .LSEH_info_bn_mulx4x_mont: .byte 9,0,0,0 .rva sqr_handler - .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] + .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] +.align 8 ___ } diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 2a7972d610..3278dc6056 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -93,6 +93,8 @@ $code=<<___; .type bn_mul_mont_gather5,\@function,6 .align 64 bn_mul_mont_gather5: + mov ${num}d,${num}d + mov %rsp,%rax test \$7,${num}d jnz .Lmul_enter ___ @@ -104,10 +106,7 @@ $code.=<<___; .align 16 .Lmul_enter: - mov ${num}d,${num}d - mov %rsp,%rax movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument - lea .Linc(%rip),%r10 push %rbx push %rbp push %r12 @@ -115,13 +114,12 @@ $code.=<<___; push %r14 push %r15 - lea 2($num),%r11 - neg %r11 - lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) - and \$-1024,%rsp # minimize TLB usage + neg $num + mov %rsp,%r11 + lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage - mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lmul_body: # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to @@ -130,13 +128,24 @@ $code.=<<___; # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... - sub %rsp,%rax - and \$-4096,%rax + sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + .Lmul_page_walk: - mov (%rsp,%rax),%r11 - sub \$4096,%rax - .byte 0x2e # predict non-taken - jnc .Lmul_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + lea .Linc(%rip),%r10 + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul_body: lea 128($bp),%r12 # reassign $bp (+size optimization) ___ @@ -442,6 +451,8 @@ $code.=<<___; .type bn_mul4x_mont_gather5,\@function,6 .align 32 bn_mul4x_mont_gather5: + .byte 0x67 + mov %rsp,%rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); @@ -450,14 +461,13 @@ $code.=<<___ if ($addx); je .Lmulx4x_enter ___ $code.=<<___; - .byte 0x67 - mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lmul4x_prologue: .byte 0x67 shl \$3,${num}d # convert $num to bytes @@ -474,32 +484,40 @@ $code.=<<___; # calculated from 7th argument, the index.] # lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt - sub %r11,%rsp # align with $rp - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + sub %r11,%rbp # align with $rp + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lmul4xsp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + .Lmul4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lmul4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: neg $num @@ -1043,6 +1061,7 @@ $code.=<<___; .type bn_power5,\@function,6 .align 32 bn_power5: + mov %rsp,%rax ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d @@ -1051,13 +1070,13 @@ $code.=<<___ if ($addx); je .Lpowerx5_enter ___ $code.=<<___; - mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lpower5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10d # 3*$num @@ -1072,32 +1091,40 @@ $code.=<<___; # calculated from 7th argument, the index.] # lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwr_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + .Lpwr_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lpwr_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: mov $num,%r10 neg $num @@ -2037,6 +2064,7 @@ bn_from_mont8x: push %r13 push %r14 push %r15 +.Lfrom_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2051,32 +2079,40 @@ bn_from_mont8x: # last operation, we use the opportunity to cleanse it. # lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lfrom_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk + jmp .Lfrom_page_walk_done + .Lfrom_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lfrom_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk +.Lfrom_page_walk_done: mov $num,%r10 neg $num @@ -2182,14 +2218,15 @@ $code.=<<___; .type bn_mulx4x_mont_gather5,\@function,6 .align 32 bn_mulx4x_mont_gather5: -.Lmulx4x_enter: mov %rsp,%rax +.Lmulx4x_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2206,31 +2243,39 @@ bn_mulx4x_mont_gather5: # calculated from 7th argument, the index.] # lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done .Lmulx4xsp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lmulx4xsp_done: - and \$-64,%rsp # ensure alignment - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp # ensure alignment + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + .Lmulx4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lmulx4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: ############################################################## # Stack layout @@ -2638,14 +2683,15 @@ $code.=<<___; .type bn_powerx5,\@function,6 .align 32 bn_powerx5: -.Lpowerx5_enter: mov %rsp,%rax +.Lpowerx5_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lpowerx5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2660,32 +2706,40 @@ bn_powerx5: # calculated from 7th argument, the index.] # lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwrx_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + .Lpwrx_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lpwrx_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: mov $num,%r10 neg $num @@ -3616,9 +3670,14 @@ mul_handler: cmp %r10,%rbx # context->Rip<end of prologue label jb .Lcommon_seh_tail + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jb .Lcommon_pop_regs + mov 152($context),%rax # pull context->Rsp - mov 4(%r11),%r10d # HandlerData[1] + mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail @@ -3630,11 +3689,11 @@ mul_handler: mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer - jmp .Lbody_proceed + jmp .Lcommon_pop_regs .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer -.Lbody_proceed: +.Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -3725,34 +3784,34 @@ $code.=<<___; .LSEH_info_bn_mul_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] .align 8 .LSEH_info_bn_mul4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] + .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_power5: .byte 9,0,0,0 .rva mul_handler - .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[] + .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] .align 8 .LSEH_info_bn_from_mont8x: .byte 9,0,0,0 .rva mul_handler - .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] + .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_bn_mulx4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] + .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_powerx5: .byte 9,0,0,0 .rva mul_handler - .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] + .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] ___ $code.=<<___; .align 8 |