author		Andy Polyakov <appro@openssl.org>	2016-09-09 17:19:58 +0200
committer	Andy Polyakov <appro@openssl.org>	2016-10-24 20:00:33 +0200
commit		68f6d2a02c8cc30c5c737fc948b7cf023a234b47 (patch)
tree		0c8c0fc5d2fc2887935f6b20f5b157823d10265b /crypto/bn
parent		0310becc82d240288a4ab5c6656c10c18cab4454 (diff)
download	openssl-68f6d2a02c8cc30c5c737fc948b7cf023a234b47.tar.gz
bn/asm/ppc-mont.pl: add optimized multiplication and squaring subroutines.
Reviewed-by: Rich Salz <rsalz@openssl.org>
Diffstat (limited to 'crypto/bn')
-rw-r--r--	crypto/bn/asm/ppc-mont.pl	1646
1 file changed, 1645 insertions(+), 1 deletion(-)
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
index 8676567cc2..ce0b061a7d 100644
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -31,6 +31,16 @@
# 2048-bit +18%
# 4096-bit +4%
+# September 2016
+#
+# Add multiplication procedure operating on lengths divisible by 4
+# and squaring procedure operating on lengths divisible by 8. Length
+# is expressed in number of limbs. RSA private key operations are
+# ~35-50% faster (more for longer keys) on contemporary high-end POWER
+# processors in 64-bit builds, [mysteriously enough] even more in 32-bit
+# builds. On low-end 32-bit processors the performance improvement
+# turned out to be marginal...
+
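[Editorial note: for orientation, the mul4x/sqr8x paths added by this commit compute an ordinary word-by-word Montgomery product. The following is a minimal, illustrative C sketch with 32-bit limbs; the name mont_mul_sketch and the flat-loop structure are inventions of this note, not the code this script generates. It shows the two inner loops that the assembly unrolls by 4 (mul4x) or 8 (sqr8x) limbs and keeps in registers.]

#include <stdint.h>
#include <stddef.h>

/* t[] must hold num+1 limbs and be zeroed on entry; n0 is the usual
 * Montgomery constant -n[0]^(-1) mod 2^32.  On return t[] holds
 * a*b*2^(-32*num) mod n, possibly plus n, with the topmost carry in t[num]. */
static void mont_mul_sketch(uint32_t *t, const uint32_t *a, const uint32_t *b,
                            const uint32_t *n, uint32_t n0, size_t num)
{
    for (size_t i = 0; i < num; i++) {
        uint64_t v, carry = 0;

        for (size_t j = 0; j < num; j++) {      /* t += a[] * b[i] */
            v = (uint64_t)a[j] * b[i] + t[j] + carry;
            t[j] = (uint32_t)v;
            carry = v >> 32;
        }
        uint64_t top = (uint64_t)t[num] + carry;

        uint32_t m = t[0] * n0;                 /* "t[0]*n0" in the comments below */
        v = (uint64_t)m * n[0] + t[0];          /* low limb cancels to zero */
        carry = v >> 32;
        for (size_t j = 1; j < num; j++) {      /* t = (t + m*n[]) >> 32 */
            v = (uint64_t)m * n[j] + t[j] + carry;
            t[j - 1] = (uint32_t)v;
            carry = v >> 32;
        }
        top += carry;
        t[num - 1] = (uint32_t)top;
        t[num] = (uint32_t)(top >> 32);         /* topmost carry, 0 or 1 */
    }
    /* the final "subtract n once if the result is >= n" step is omitted here */
}

[The conditional subtraction the sketch omits corresponds to the .Lmul4x_post and .Lsqr8x_sub code further down.]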
$flavour = shift;
if ($flavour =~ /32/) {
@@ -50,6 +60,7 @@ if ($flavour =~ /32/) {
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
$SHRI= "srwi"; # unsigned shift right by immediate
+ $SHLI= "slwi"; # unsigned shift left by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($flavour =~ /64/) {
@@ -70,6 +81,7 @@ if ($flavour =~ /32/) {
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
$SHRI= "srdi"; # unsigned shift right by immediate
+ $SHLI= "sldi"; # unsigned shift left by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $flavour"; }
@@ -120,7 +132,7 @@ $code=<<___;
.text
.globl .bn_mul_mont_int
-.align 4
+.align 5
.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
@@ -338,6 +350,1638 @@ Lcopy: ; copy or in-place refresh
.size .bn_mul_mont_int,.-.bn_mul_mont_int
___
}
+if (1) {
+my ($a0,$a1,$a2,$a3,
+ $t0,$t1,$t2,$t3,
+ $m0,$m1,$m2,$m3,
+ $acc0,$acc1,$acc2,$acc3,$acc4,
+ $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31));
+my ($carry,$zero) = ($rp,"r0");
+
+# sp----------->+-------------------------------+
+# | saved sp |
+# +-------------------------------+
+# . .
+# +8*size_t +-------------------------------+
+# | 4 "n0*t0" |
+# . .
+# . .
+# +12*size_t +-------------------------------+
+# | size_t tmp[num] |
+# . .
+# . .
+# . .
+# +-------------------------------+
+# | topmost carry |
+# . .
+# -18*size_t +-------------------------------+
+# | 18 saved gpr, r14-r31 |
+# . .
+# . .
+# +-------------------------------+
+$code.=<<___;
+.globl .bn_mul4x_mont_int
+.align 5
+.bn_mul4x_mont_int:
+ andi. r0,$num,7
+ bne .Lmul4x_do
+ $UCMP $ap,$bp
+ bne .Lmul4x_do
+ b .Lsqr8x_do
+.Lmul4x_do:
+ slwi $num,$num,`log($SIZE_T)/log(2)`
+ mr $a0,$sp
+ li $a1,-32*$SIZE_T
+ sub $a1,$a1,$num
+ $STUX $sp,$sp,$a1 # alloca
+
+ $PUSH r14,-$SIZE_T*18($a0)
+ $PUSH r15,-$SIZE_T*17($a0)
+ $PUSH r16,-$SIZE_T*16($a0)
+ $PUSH r17,-$SIZE_T*15($a0)
+ $PUSH r18,-$SIZE_T*14($a0)
+ $PUSH r19,-$SIZE_T*13($a0)
+ $PUSH r20,-$SIZE_T*12($a0)
+ $PUSH r21,-$SIZE_T*11($a0)
+ $PUSH r22,-$SIZE_T*10($a0)
+ $PUSH r23,-$SIZE_T*9($a0)
+ $PUSH r24,-$SIZE_T*8($a0)
+ $PUSH r25,-$SIZE_T*7($a0)
+ $PUSH r26,-$SIZE_T*6($a0)
+ $PUSH r27,-$SIZE_T*5($a0)
+ $PUSH r28,-$SIZE_T*4($a0)
+ $PUSH r29,-$SIZE_T*3($a0)
+ $PUSH r30,-$SIZE_T*2($a0)
+ $PUSH r31,-$SIZE_T*1($a0)
+
+ subi $ap,$ap,$SIZE_T # bias by -1
+ subi $np,$np,$SIZE_T # bias by -1
+ subi $rp,$rp,$SIZE_T # bias by -1
+ $LD $n0,0($n0) # *n0
+
+ add $t0,$bp,$num
+ add $ap_end,$ap,$num
+ subi $t0,$t0,$SIZE_T*4 # &b[num-4]
+
+ $LD $bi,$SIZE_T*0($bp) # b[0]
+ li $acc0,0
+ $LD $a0,$SIZE_T*1($ap) # a[0..3]
+ li $acc1,0
+ $LD $a1,$SIZE_T*2($ap)
+ li $acc2,0
+ $LD $a2,$SIZE_T*3($ap)
+ li $acc3,0
+ $LDU $a3,$SIZE_T*4($ap)
+ $LD $m0,$SIZE_T*1($np) # n[0..3]
+ $LD $m1,$SIZE_T*2($np)
+ $LD $m2,$SIZE_T*3($np)
+ $LDU $m3,$SIZE_T*4($np)
+
+ $PUSH $rp,$SIZE_T*6($sp) # offload rp and &b[num-4]
+ $PUSH $t0,$SIZE_T*7($sp)
+ li $carry,0
+ addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit
+ li $cnt,0
+ li $zero,0
+ b .Loop_mul4x_1st_reduction
+
+.align 5
+.Loop_mul4x_1st_reduction:
+ $UMULL $t0,$a0,$bi # lo(a[0..3]*b[0])
+ addze $carry,$carry # modulo-scheduled
+ $UMULL $t1,$a1,$bi
+ addi $cnt,$cnt,$SIZE_T
+ $UMULL $t2,$a2,$bi
+ andi. $cnt,$cnt,$SIZE_T*4-1
+ $UMULL $t3,$a3,$bi
+ addc $acc0,$acc0,$t0
+ $UMULH $t0,$a0,$bi # hi(a[0..3]*b[0])
+ adde $acc1,$acc1,$t1
+ $UMULH $t1,$a1,$bi
+ adde $acc2,$acc2,$t2
+ $UMULL $mi,$acc0,$n0 # t[0]*n0
+ adde $acc3,$acc3,$t3
+ $UMULH $t2,$a2,$bi
+ addze $acc4,$zero
+ $UMULH $t3,$a3,$bi
+ $LDX $bi,$bp,$cnt # next b[i] (or b[0])
+ addc $acc1,$acc1,$t0
+ # (*) mul $t0,$m0,$mi # lo(n[0..3]*t[0]*n0)
+ $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
+ adde $acc2,$acc2,$t1
+ $UMULL $t1,$m1,$mi
+ adde $acc3,$acc3,$t2
+ $UMULL $t2,$m2,$mi
+ adde $acc4,$acc4,$t3 # can't overflow
+ $UMULL $t3,$m3,$mi
+ # (*) addc $acc0,$acc0,$t0
+	# (*) On the removal of the first multiplication and addition
+	# instructions: the result of that first addition is guaranteed
+	# to be zero, so the only computationally significant outcome
+	# is whether it carries or not. When does it carry? If you
+	# follow the operations, the condition turns out to be quite
+	# simple: $acc0 being non-zero. So the carry can be obtained by
+	# adding -1 to $acc0, which is what the next instruction does.
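+	# For example: if $acc0 is zero, adding -1 yields all ones with
+	# no carry out; if $acc0 is non-zero, the addition wraps past
+	# zero and sets the carry bit, exactly the carry the discarded
+	# lo(n[0]*t[0]*n0)+$acc0 would have produced.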
+ addic $acc0,$acc0,-1 # (*), discarded
+ $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0)
+ adde $acc0,$acc1,$t1
+ $UMULH $t1,$m1,$mi
+ adde $acc1,$acc2,$t2
+ $UMULH $t2,$m2,$mi
+ adde $acc2,$acc3,$t3
+ $UMULH $t3,$m3,$mi
+ adde $acc3,$acc4,$carry
+ addze $carry,$zero
+ addc $acc0,$acc0,$t0
+ adde $acc1,$acc1,$t1
+ adde $acc2,$acc2,$t2
+ adde $acc3,$acc3,$t3
+ #addze $carry,$carry
+ bne .Loop_mul4x_1st_reduction
+
+ $UCMP $ap_end,$ap
+ beq .Lmul4x4_post_condition
+
+ $LD $a0,$SIZE_T*1($ap) # a[4..7]
+ $LD $a1,$SIZE_T*2($ap)
+ $LD $a2,$SIZE_T*3($ap)
+ $LDU $a3,$SIZE_T*4($ap)
+ $LD $mi,$SIZE_T*8($sp) # a[0]*n0
+ $LD $m0,$SIZE_T*1($np) # n[4..7]
+ $LD $m1,$SIZE_T*2($np)
+ $LD $m2,$SIZE_T*3($np)
+ $LDU $m3,$SIZE_T*4($np)
+ b .Loop_mul4x_1st_tail
+
+.align 5
+.Loop_mul4x_1st_tail:
+ $UMULL $t0,$a0,$bi # lo(a[4..7]*b[i])
+ addze $carry,$carry # modulo-scheduled
+ $UMULL $t1,$a1,$bi
+ addi $cnt,$cnt,$SIZE_T
+ $UMULL $t2,$a2,$bi
+ andi. $cnt,$cnt,$SIZE_T*4-1
+ $UMULL $t3,$a3,$bi
+ addc $acc0,$acc0,$t0
+ $UMULH $t0,$a0,$bi # hi(a[4..7]*b[i])
+ adde $acc1,$acc1,$t1
+ $UMULH $t1,$a1,$bi
+ adde $acc2,$acc2,$t2
+ $UMULH $t2,$a2,$bi
+ adde $acc3,$acc3,$t3
+ $UMULH $t3,$a3,$bi
+ addze $acc4,$zero
+ $LDX $bi,$bp,$cnt # next b[i] (or b[0])
+ addc $acc1,$acc1,$t0
+ $UMULL $t0,$m0,$mi # lo(n[4..7]*a[0]*n0)
+ adde $acc2,$acc2,$t1
+ $UMULL $t1,$m1,$mi
+ adde $acc3,$acc3,$t2
+ $UMULL $t2,$m2,$mi
+ adde $acc4,$acc4,$t3 # can't overflow
+ $UMULL $t3,$m3,$mi
+ addc $acc0,$acc0,$t0
+ $UMULH $t0,$m0,$mi # hi(n[4..7]*a[0]*n0)
+ adde $acc1,$acc1,$t1
+ $UMULH $t1,$m1,$mi
+ adde $acc2,$acc2,$t2
+ $UMULH $t2,$m2,$mi
+ adde $acc3,$acc3,$t3
+ adde $acc4,$acc4,$carry
+ $UMULH $t3,$m3,$mi
+ addze $carry,$zero
+ addi $mi,$sp,$SIZE_T*8
+ $LDX $mi,$mi,$cnt # next t[0]*n0
+ $STU $acc0,$SIZE_T($tp) # word of result
+ addc $acc0,$acc1,$t0
+ adde $acc1,$acc2,$t1
+ adde $acc2,$acc3,$t2
+ adde $acc3,$acc4,$t3
+ #addze $carry,$carry
+ bne .Loop_mul4x_1st_tail
+
+	sub $t1,$ap_end,$num # rewound $ap
+ $UCMP $ap_end,$ap # done yet?
+ beq .Lmul4x_proceed
+
+ $LD $a0,$SIZE_T*1($ap)
+ $LD $a1,$SIZE_T*2($ap)
+ $LD $a2,$SIZE_T*3($ap)
+ $LDU $a3,$SIZE_T*4($ap)
+ $LD $m0,$SIZE_T*1($np)
+ $LD $m1,$SIZE_T*2($np)
+ $LD $m2,$SIZE_T*3($np)
+ $LDU $m3,$SIZE_T*4($np)
+ b .Loop_mul4x_1st_tail
+
+.align 5
+.Lmul4x_proceed:
+ $LDU $bi,$SIZE_T*4($bp) # *++b
+ addze $carry,$carry # topmost carry
+ $LD $a0,$SIZE_T*1($t1)
+ $LD $a1,$SIZE_T*2($t1)
+ $LD $a2,$SIZE_T*3($t1)
+ $LD $a3,$SIZE_T*4($t1)
+ addi $ap,$t1,$SIZE_T*4
+ sub $np,$np,$num # rewind np
+
+ $ST $acc0,$SIZE_T*1($tp) # result
+ $ST $acc1,$SIZE_T*2($tp)
+ $ST $acc2,$SIZE_T*3($tp)
+ $ST $acc3,$SIZE_T*4($tp)
+ $ST $carry,$SIZE_T*5($tp) # save topmost carry
+ $LD $acc0,$SIZE_T*12($sp) # t[0..3]
+ $LD $acc1,$SIZE_T*13($sp)
+ $LD $acc2,$SIZE_T*14($sp)
+ $LD $acc3,$SIZE_T*15($sp)
+
+ $LD $m0,$SIZE_T*1($np) # n[0..3]
+ $LD $m1,$SIZE_T*2($np)
+ $LD $m2,$SIZE_T*3($np)
+ $LDU $m3,$SIZE_T*4($np)
+ addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit
+ li $carry,0
+ b .Loop_mul4x_reduction
+
+.align 5
+.Loop_mul4x_reduction:
+ $UMULL $t0,$a0,$bi # lo(a[0..3]*b[4])
+ addze $carry,$carry # modulo-scheduled
+ $UMULL $t1,$a1,$bi
+ addi $cnt,$cnt,$SIZE_T
+ $UMULL $t2,$a2,$bi
+ andi. $cnt,$cnt,$SIZE_T*4-1
+ $UMULL $t3,$a3,$bi
+ addc $acc0,$acc0,$t0
+ $UMULH $t0,$a0,$bi # hi(a[0..3]*b[4])
+ adde $acc1,$acc1,$t1
+ $UMULH $t1,$a1,$bi
+ adde $acc2,$acc2,$t2
+ $UMULL $mi,$acc0,$n0 # t[0]*n0
+ adde $acc3,$acc3,$t3
+ $UMULH $t2,$a2,$bi
+ addze $acc4,$zero
+ $UMULH $t3,$a3,$bi
+ $LDX $bi,$bp,$cnt # next b[i]
+ addc $acc1,$acc1,$t0
+ # (*) mul $t0,$m0,$mi
+ $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
+ adde $acc2,$acc2,$t1
+	$UMULL $t1,$m1,$mi # lo(n[0..3]*t[0]*n0)
+ adde $acc3,$acc3,$t2
+ $UMULL $t2,$m2,$mi
+ adde $acc4,$acc4,$t3 # can't overflow
+ $UMULL $t3,$m3,$mi
+ # (*) addc $acc0,$acc0,$t0
+ addic $acc0,$acc0,-1 # (*), discarded
+	$UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0)
+ adde $acc0,$acc1,$t1
+ $UMULH $t1,$m1,$mi
+ adde $acc1,$acc2,$t2
+ $UMULH $t2,$m2,$mi
+ adde $acc2,$acc3,$t3
+ $UMULH $t3,$m3,$mi
+ adde $acc3,$acc4,$carry
+ addze $carry,$zero
+ addc $acc0,$acc0,$t0
+ adde $acc1,$acc1,$t1
+ adde $acc2,$acc2,$t2
+ adde $acc3,$acc3,$t3
+ #addze $carry,$carry
+ bne .Loop_mul4x_reduction
+
+ $LD $t0,$SIZE_T*5($tp) # t[4..7]
+ addze $carry,$carry
+ $LD $t1,$SIZE_T*6($tp)
+ $LD $t2,$SIZE_T*7($tp)
+ $LD $t3,$SIZE_T*8($tp)
+ $LD $a0,$SIZE_T*1($ap) # a[4..7]
+ $LD $a1,$SIZE_T*2($ap)
+ $LD $a2,$SIZE_T*3($ap)
+ $LDU $a3,$SIZE_T*4($ap)
+ addc $acc0,$acc0,$t0
+ adde $acc1,$acc1,$t1
+ adde $acc2,$acc2,$t2
+ adde $acc3,$acc3,$t3
+ #addze $carry,$carry
+
+ $LD $mi,$SIZE_T*8($sp) # t[0]*n0
+ $LD $m0,$SIZE_T*1($np) # n[4..7]
+ $LD $m1,$SIZE_T*2($np)
+ $LD $m2,$SIZE_T*3($np)
+ $LDU $m3,$SIZE_T*4($np)
+ b .Loop_mul4x_tail
+
+.align 5
+.Loop_mul4x_tail:
+ $UMULL $t0,$a0,$bi # lo(a[4..7]*b[4])
+ addze $carry,$carry # modulo-scheduled
+ $UMULL $t1,$a1,$bi
+ addi $cnt,$cnt,$SIZE_T
+ $UMULL $t2,$a2,$bi
+ andi. $cnt,$cnt,$SIZE_T*4-1
+ $UMULL $t3,$a3,$bi
+ addc $acc0,$acc0,$t0
+ $UMULH $t0,$a0,$bi # hi(a[4..7]*b[4])
+ adde $acc1,$acc1,$t1
+ $UMULH $t1,$a1,$bi
+ adde $acc2,$acc2,$t2
+ $UMULH $t2,$a2,$bi
+ adde $acc3,$acc3,$t3
+ $UMULH $t3,$a3,$bi
+ addze $acc4,$zero
+ $LDX $bi,$bp,$cnt # next b[i]
+ addc $acc1,$acc1,$t0
+ $UMULL $t0,$m0,$mi # lo(n[4..7]*t[0]*n0)
+ adde $acc2,$acc2,$t1
+ $UMULL $t1,$m1,$mi
+ adde $acc3,$acc3,$t2
+ $UMULL $t2,$m2,$mi
+ adde $acc4,$acc4,$t3 # can't overflow
+ $UMULL $t3,$m3,$mi
+ addc $acc0,$acc0,$t0
+ $UMULH $t0,$m0,$mi # hi(n[4..7]*t[0]*n0)
+ adde $acc1,$acc1,$t1
+ $UMULH $t1,$m1,$mi
+ adde $acc2,$acc2,$t2
+ $UMULH $t2,$m2,$mi
+ adde $acc3,$acc3,$t3
+ $UMULH $t3,$m3,$mi
+ adde $acc4,$acc4,$carry
+ addi $mi,$sp,$SIZE_T*8
+ $LDX $mi,$mi,$cnt # next a[0]*n0
+ addze $carry,$zero
+ $STU $acc0,$SIZE_T($tp) # word of result
+ addc $acc0,$acc1,$t0
+ adde $acc1,$acc2,$t1
+ adde $acc2,$acc3,$t2
+ adde $acc3,$acc4,$t3
+ #addze $carry,$carry
+ bne .Loop_mul4x_tail
+
+ $LD $t0,$SIZE_T*5($tp) # next t[i] or topmost carry
+	sub $t1,$np,$num # rewound np?
+ addze $carry,$carry
+ $UCMP $ap_end,$ap # done yet?
+ beq .Loop_mul4x_break
+
+ $LD $t1,$SIZE_T*6($tp)
+ $LD $t2,$SIZE_T*7($tp)
+ $LD $t3,$SIZE_T*8($tp)
+ $LD $a0,$SIZE_T*1($ap)
+ $LD $a1,$SIZE_T*2($ap)
+ $LD $a2,$SIZE_T*3($ap)
+ $LDU $a3,$SIZE_T*4($ap)
+ addc $acc0,$acc0,$t0
+ adde $acc1,$acc1,$t1
+ adde $acc2,$acc2,$t2
+ adde $acc3,$acc3,$t3
+ #addze $carry,$carry
+
+ $LD $m0,$SIZE_T*1($np) # n[4..7]
+ $LD $m1,$SIZE_T*2($np)
+ $LD $m2,$SIZE_T*3($np)
+ $LDU $m3,$SIZE_T*4($np)
+ b .Loop_mul4x_tail
+
+.align 5
+.Loop_mul4x_break:
+ $POP $t2,$SIZE_T*6($sp) # pull rp and &b[num-4]
+ $POP $t3,$SIZE_T*7($sp)
+ addc $a0,$acc0,$t0 # accumulate topmost carry
+ $LD $acc0,$SIZE_T*12($sp) # t[0..3]
+ addze $a1,$acc1
+ $LD $acc1,$SIZE_T*13($sp)
+ addze $a2,$acc2
+ $LD $acc2,$SIZE_T*14($sp)
+ addze $a3,$acc3
+ $LD $acc3,$SIZE_T*15($sp)
+ addze $carry,$carry # topmost carry
+ $ST $a0,$SIZE_T*1($tp) # result
+ sub $ap,$ap_end,$num # rewind ap
+ $ST $a1,$SIZE_T*2($tp)
+ $ST $a2,$SIZE_T*3($tp)
+ $ST $a3,$SIZE_T*4($tp)
+ $ST $carry,$SIZE_T*5($tp) # store topmost carry
+
+ $LD $m0,$SIZE_T*1($t1) # n[0..3]
+ $LD $m1,$SIZE_T*2($t1)
+ $LD $m2,$SIZE_T*3($t1)
+ $LD $m3,$SIZE_T*4($t1)
+ addi $np,$t1,$SIZE_T*4
+ $UCMP $bp,$t3 # done yet?
+ beq .Lmul4x_post
+
+ $LDU $bi,$SIZE_T*4($bp)
+ $LD $a0,$SIZE_T*1($ap) # a[0..3]
+ $LD $a1,$SIZE_T*2($ap)
+ $LD $a2,$SIZE_T*3($ap)
+ $LDU $a3,$SIZE_T*4($ap)
+ li $carry,0
+ addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit
+ b .Loop_mul4x_reduction
+
+.align 5
+.Lmul4x_post:
+	# Final step. We check whether the result is larger than the
+	# modulus and, if it is, subtract the modulus. But comparison
+	# implies subtraction, so we subtract the modulus, see if it
+	# borrowed, and conditionally copy the original value.
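+	# The borrow is turned into an all-ones/all-zeroes mask in
+	# $carry, so that .Lmul4x_cond_copy below selects the original
+	# or the reduced value with and/andc/or instead of taking a
+	# data-dependent branch.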
+ srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
+ mr $bp,$t2 # &rp[-1]
+ subi $cnt,$cnt,1
+ mr $ap_end,$t2 # &rp[-1] copy
+ subfc $t0,$m0,$acc0
+ addi $tp,$sp,$SIZE_T*15
+ subfe $t1,$m1,$acc1
+
+ mtctr $cnt
+.Lmul4x_sub:
+ $LD $m0,$SIZE_T*1($np)
+ $LD $acc0,$SIZE_T*1($tp)
+ subfe $t2,$m2,$acc2
+ $LD $m1,$SIZE_T*2($np)
+ $LD $acc1,$SIZE_T*2($tp)
+ subfe $t3,$m3,$acc3
+ $LD $m2,$SIZE_T*3($np)
+ $LD $acc2,$SIZE_T*3($tp)
+ $LDU $m3,$SIZE_T*4($np)
+ $LDU $acc3,$SIZE_T*4($tp)
+ $ST $t0,$SIZE_T*1($bp)
+ $ST $t1,$SIZE_T*2($bp)
+ subfe $t0,$m0,$acc0
+ $ST $t2,$SIZE_T*3($bp)
+ $STU $t3,$SIZE_T*4($bp)
+ subfe $t1,$m1,$acc1
+ bdnz .Lmul4x_sub
+
+ $LD $a0,$SIZE_T*1($ap_end)
+ $ST $t0,$SIZE_T*1($bp)
+ $LD $t0,$SIZE_T*12($sp)
+ subfe $t2,$m2,$acc2
+ $LD $a1,$SIZE_T*2($ap_end)
+ $ST $t1,$SIZE_T*2($bp)
+ $LD $t1,$SIZE_T*13($sp)
+ subfe $t3,$m3,$acc3
+ subfe $carry,$zero,$carry # did it borrow?
+ addi $tp,$sp,$SIZE_T*12
+ $LD $a2,$SIZE_T*3($ap_end)
+ $ST $t2,$SIZE_T*3($bp)
+ $LD $t2,$SIZE_T*14($sp)
+ $LD $a3,$SIZE_T*4($ap_end)
+ $ST $t3,$SIZE_T*4($bp)
+ $LD $t3,$SIZE_T*15($sp)
+
+ mtctr $cnt
+.Lmul4x_cond_copy:
+ and $t0,$t0,$carry
+ andc $a0,$a0,$carry
+ $ST $zero,$SIZE_T*0($tp) # wipe stack clean
+ and $t1,$t1,$carry
+ andc $a1,$a1,$carry
+ $ST $zero,$SIZE_T*1($tp)
+ and $t2,$t2,$carry
+ andc $a2,$a2,$carry
+ $ST $zero,$SIZE_T*2($tp)
+ and $t3,$t3,$carry
+ andc $a3,$a3,$carry
+ $ST $zero,$SIZE_T*3($tp)
+ or $acc0,$t0,$a0
+ $LD $a0,$SIZE_T*5($ap_end)
+ $LD $t0,$SIZE_T*4($tp)
+ or $acc1,$t1,$a1
+ $LD $a1,$SIZE_T*6($ap_end)
+ $LD $t1,$SIZE_T*5($tp)
+ or $acc2,$t2,$a2
+ $LD $a2,$SIZE_T*7($ap_end)
+ $LD $t2,$SIZE_T*6($tp)
+ or $acc3,$t3,$a3
+ $LD $a3,$SIZE_T*8($ap_end)
+ $LD $t3,$SIZE_T*7($tp)
+ addi $tp,$tp,$SIZE_T*4
+ $ST $acc0,$SIZE_T*1($ap_end)
+ $ST $acc1,$SIZE_T*2($ap_end)
+ $ST $acc2,$SIZE_T*3($ap_end)
+ $STU $acc3,$SIZE_T*4($ap_end)
+ bdnz .Lmul4x_cond_copy
+
+ $POP $bp,0($sp) # pull saved sp
+ and $t0,$t0,$carry
+ andc $a0,$a0,$carry
+ $ST $zero,$SIZE_T*0($tp)
+ and $t1,$t1,$carry
+ andc $a1,$a1,$carry
+ $ST $zero,$SIZE_T*1($tp)
+ and $t2,$t2,$carry
+ andc $a2,$a2,$carry
+ $ST $zero,$SIZE_T*2($tp)
+ and $t3,$t3,$carry
+ andc $a3,$a3,$carry
+ $ST $zero,$SIZE_T*3($tp)
+ or $acc0,$t0,$a0
+ or $acc1,$t1,$a1
+ $ST $zero,$SIZE_T*4($tp)
+ or $acc2,$t2,$a2
+ or $acc3,$t3,$a3
+ $ST $acc0,$SIZE_T*1($ap_end)
+ $ST $acc1,$SIZE_T*2($ap_end)
+ $ST $acc2,$SIZE_T*3($ap_end)
+ $ST $acc3,$SIZE_T*4($ap_end)
+
+ b .Lmul4x_done
+
+.align 4
+.Lmul4x4_post_condition:
+ $POP $ap,$SIZE_T*6($sp) # pull &rp[-1]
+ $POP $bp,0($sp) # pull saved sp
+ addze $carry,$carry # modulo-scheduled
+ # $acc0-3,$carry hold result, $m0-3 hold modulus
+ subfc $a0,$m0,$acc0
+ subfe $a1,$m1,$acc1
+ subfe $a2,$m2,$acc2
+ subfe $a3,$m3,$acc3
+ subfe $carry,$zero,$carry # did it borrow?
+
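+	# $carry is now an all-ones/all-zeroes mask: the modulus is
+	# ANDed with it and added back, undoing the subtraction above
+	# only if it borrowed, without a data-dependent branch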
+ and $m0,$m0,$carry
+ and $m1,$m1,$carry
+ addc $a0,$a0,$m0
+ and $m2,$m2,$carry
+ adde $a1,$a1,$m1
+ and $m3,$m3,$carry
+ adde $a2,$a2,$m2
+ adde $a3,$a3,$m3
+
+ $ST $a0,$SIZE_T*1($ap) # write result
+ $ST $a1,$SIZE_T*2($ap)
+ $ST $a2,$SIZE_T*3($ap)
+ $ST $a3,$SIZE_T*4($ap)
+
+.Lmul4x_done:
+ $ST $zero,$SIZE_T*8($sp) # wipe stack clean
+ $ST $zero,$SIZE_T*9($sp)
+ $ST $zero,$SIZE_T*10($sp)
+ $ST $zero,$SIZE_T*11($sp)
+ li r3,1 # signal "done"
+ $POP r14,-$SIZE_T*18($bp)
+ $POP r15,-$SIZE_T*17($bp)
+ $POP r16,-$SIZE_T*16($bp)
+ $POP r17,-$SIZE_T*15($bp)
+ $POP r18,-$SIZE_T*14($bp)
+ $POP r19,-$SIZE_T*13($bp)
+ $POP r20,-$SIZE_T*12($bp)
+ $POP r21,-$SIZE_T*11($bp)
+ $POP r22,-$SIZE_T*10($bp)
+ $POP r23,-$SIZE_T*9($bp)
+ $POP r24,-$SIZE_T*8($bp)
+ $POP r25,-$SIZE_T*7($bp)
+ $POP r26,-$SIZE_T*6($bp)
+ $POP r27,-$SIZE_T*5($bp)
+ $POP r28,-$SIZE_T*4($bp)
+ $POP r29,-$SIZE_T*3($bp)
+ $POP r30,-$SIZE_T*2($bp)
+ $POP r31,-$SIZE_T*1($bp)
+ mr $sp,$bp
+ blr
+ .long 0
+ .byte 0,12,4,0x20,0x80,18,6,0
+ .long 0
+.size .bn_mul4x_mont_int,.-.bn_mul4x_mont_int
+___
+}
+
+if (1) {
+########################################################################
+# The following is a PPC adaptation of sqrx8x_mont from the x86_64-mont5 module.
+
+my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17));
+my ($t0,$t1,$t2,$t3)=map("r$_",(18..21));
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29));
+my ($cnt,$carry,$zero)=("r30","r31","r0");
+my ($tp,$ap_end,$na0)=($bp,$np,$carry);
+
+# sp----------->+-------------------------------+
+# | saved sp |
+# +-------------------------------+
+# . .
+# +12*size_t +-------------------------------+
+# | size_t tmp[2*num] |
+# . .
+# . .
+# . .
+# +-------------------------------+
+# . .
+# -18*size_t +-------------------------------+
+# | 18 saved gpr, r14-r31 |
+# . .
+# . .
+# +-------------------------------+
+$code.=<<___;
+.align 5
+__bn_sqr8x_mont:
+.Lsqr8x_do:
+ mr $a0,$sp
+ slwi $a1,$num,`log($SIZE_T)/log(2)+1`
+ li $a2,-32*$SIZE_T
+ sub $a1,$a2,$a1
+ slwi $num,$num,`log($SIZE_T)/log(2)`
+ $STUX $sp,$sp,$a1 # alloca
+
+ $PUSH r14,-$SIZE_T*18($a0)
+ $PUSH r15,-$SIZE_T*17($a0)
+ $PUSH r16,-$SIZE_T*16($a0)
+ $PUSH r17,-$SIZE_T*15($a0)
+ $PUSH r18,-$SIZE_T*14($a0)
+ $PUSH r19,-$SIZE_T*13($a0)
+ $PUSH r20,-$SIZE_T*12($a0)
+ $PUSH r21,-$SIZE_T*11($a0)
+ $PUSH r22,-$SIZE_T*10($a0)
+ $PUSH r23,-$SIZE_T*9($a0)
+ $PUSH r24,-$SIZE_T*8($a0)
+ $PUSH r25,-$SIZE_T*7($a0)
+ $PUSH r26,-$SIZE_T*6($a0)
+ $PUSH r27,-$SIZE_T*5($a0)
+ $PUSH r28,-$SIZE_T*4($a0)
+ $PUSH r29,-$SIZE_T*3($a0)
+ $PUSH r30,-$SIZE_T*2($a0)
+ $PUSH r31,-$SIZE_T*1($a0)
+
+ subi $ap,$ap,$SIZE_T # bias by -1
+ subi $t0,$np,$SIZE_T # bias by -1
+ subi $rp,$rp,$SIZE_T # bias by -1
+ $LD $n0,0($n0) # *n0
+ li $zero,0
+
+ add $ap_end,$ap,$num
+ $LD $a0,$SIZE_T*1($ap)
+ #li $acc0,0
+ $LD $a1,$SIZE_T*2($ap)
+ li $acc1,0
+ $LD $a2,$SIZE_T*3($ap)
+ li $acc2,0
+ $LD $a3,$SIZE_T*4($ap)
+ li $acc3,0
+ $LD $a4,$SIZE_T*5($ap)
+ li $acc4,0
+ $LD $a5,$SIZE_T*6($ap)
+ li $acc5,0
+ $LD $a6,$SIZE_T*7($ap)
+ li $acc6,0
+ $LDU $a7,$SIZE_T*8($ap)
+ li $acc7,0
+
+ addi $tp,$sp,$SIZE_T*11 # &tp[-1]
+ subic. $cnt,$num,$SIZE_T*8
+ b .Lsqr8x_zero_start
+
+.align 5
+.Lsqr8x_zero:
+ subic. $cnt,$cnt,$SIZE_T*8
+ $ST $zero,$SIZE_T*1($tp)
+ $ST $zero,$SIZE_T*2($tp)
+ $ST $zero,$SIZE_T*3($tp)
+ $ST $zero,$SIZE_T*4($tp)
+ $ST $zero,$SIZE_T*5($tp)
+ $ST $zero,$SIZE_T*6($tp)
+ $ST $zero,$SIZE_T*7($tp)
+ $ST $zero,$SIZE_T*8($tp)
+.Lsqr8x_zero_start:
+ $ST $zero,$SIZE_T*9($tp)
+ $ST $zero,$SIZE_T*10($tp)
+ $ST $zero,$SIZE_T*11($tp)
+ $ST $zero,$SIZE_T*12($tp)
+ $ST $zero,$SIZE_T*13($tp)
+ $ST $zero,$SIZE_T*14($tp)
+ $ST $zero,$SIZE_T*15($tp)
+ $STU $zero,$SIZE_T*16($tp)
+ bne .Lsqr8x_zero
+
+ $PUSH $rp,$SIZE_T*6($sp) # offload &rp[-1]
+ $PUSH $t0,$SIZE_T*7($sp) # offload &np[-1]
+ $PUSH $n0,$SIZE_T*8($sp) # offload n0
+ $PUSH $tp,$SIZE_T*9($sp) # &tp[2*num-1]
+ $PUSH $zero,$SIZE_T*10($sp) # initial top-most carry
+ addi $tp,$sp,$SIZE_T*11 # &tp[-1]
+
+ # Multiply everything but a[i]*a[i]
+.align 5
+.Lsqr8x_outer_loop:
+ # a[1]a[0] (i)
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[4]a[0]
+ # a[5]a[0]
+ # a[6]a[0]
+ # a[7]a[0]
+ # a[2]a[1] (ii)
+ # a[3]a[1]
+ # a[4]a[1]
+ # a[5]a[1]
+ # a[6]a[1]
+ # a[7]a[1]
+ # a[3]a[2] (iii)
+ # a[4]a[2]
+ # a[5]a[2]
+ # a[6]a[2]
+ # a[7]a[2]
+ # a[4]a[3] (iv)
+ # a[5]a[3]
+ # a[6]a[3]
+ # a[7]a[3]
+ # a[5]a[4] (v)
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5] (vi)
+ # a[7]a[5]
+ # a[7]a[6] (vii)
+
+ $UMULL $t0,$a1,$a0 # lo(a[1..7]*a[0]) (i)
+ $UMULL $t1,$a2,$a0
+ $UMULL $t2,$a3,$a0
+ $UMULL $t3,$a4,$a0
+ addc $acc1,$acc1,$t0 # t[1]+lo(a[1]*a[0])
+ $UMULL $t0,$a5,$a0
+ adde $acc2,$acc2,$t1
+ $UMULL $t1,$a6,$a0
+ adde $acc3,$acc3,$t2
+ $UMULL $t2,$a7,$a0
+ adde $acc4,$acc4,$t3
+ $UMULH $t3,$a1,$a0 # hi(a[1..7]*a[0])
+ adde $acc5,$acc5,$t0
+ $UMULH $t0,$a2,$a0
+ adde $acc6,$acc6,$t1
+ $UMULH $t1,$a3,$a0
+ adde $acc7,$acc7,$t2
+ $UMULH $t2,$a4,$a0
+ $ST $acc0,$SIZE_T*1($tp) # t[0]
+ addze $acc0,$zero # t[8]
+ $ST $acc1,$SIZE_T*2($tp) # t[1]
+ addc $acc2,$acc2,$t3 # t[2]+lo(a[1]*a[0])
+ $UMULH $t3,$a5,$a0
+ adde $acc3,$acc3,$t0
+ $UMULH $t0,$a6,$a0
+ adde $acc4,$acc4,$t1
+ $UMULH $t1,$a7,$a0
+ adde $acc5,$acc5,$t2
+ $UMULL $t2,$a2,$a1 # lo(a[2..7]*a[1]) (ii)
+ adde $acc6,$acc6,$t3
+ $UMULL $t3,$a3,$a1
+ adde $acc7,$acc7,$t0
+ $UMULL $t0,$a4,$a1
+ adde $acc0,$acc0,$t1
+
+ $UMULL $t1,$a5,$a1
+ addc $acc3,$acc3,$t2
+ $UMULL $t2,$a6,$a1
+ adde $acc4,$acc4,$t3
+ $UMULL $t3,$a7,$a1
+ adde $acc5,$acc5,$t0
+ $UMULH $t0,$a2,$a1 # hi(a[2..7]*a[1])
+ adde $acc6,$acc6,$t1
+ $UMULH $t1,$a3,$a1
+ adde $acc7,$acc7,$t2
+ $UMULH $t2,$a4,$a1
+ adde $acc0,$acc0,$t3
+ $UMULH $t3,$a5,$a1
+ $ST $acc2,$SIZE_T*3($tp) # t[2]
+ addze $acc1,$zero # t[9]
+ $ST $acc3,$SIZE_T*4($tp) # t[3]
+ addc $acc4,$acc4,$t0
+ $UMULH $t0,$a6,$a1
+ adde $acc5,$acc5,$t1
+ $UMULH $t1,$a7,$a1
+ adde $acc6,$acc6,$t2
+ $UMULL $t2,$a3,$a2 # lo(a[3..7]*a[2]) (iii)
+ adde $acc7,$acc7,$t3
+ $UMULL $t3,$a4,$a2
+ adde $acc0,$acc0,$t0
+ $UMULL $t0,$a5,$a2
+ adde $acc1,$acc1,$t1
+
+ $UMULL $t1,$a6,$a2
+ addc $acc5,$acc5,$t2
+ $UMULL $t2,$a7,$a2
+ adde $acc6,$acc6,$t3
+ $UMULH $t3,$a3,$a2 # hi(a[3..7]*a[2])
+ adde $acc7,$acc7,$t0
+ $UMULH $t0,$a4,$a2
+ adde $acc0,$acc0,$t1
+ $UMULH $t1,$a5,$a2
+ adde $acc1,$acc1,$t2
+ $UMULH $t2,$a6,$a2
+ $ST $acc4,$SIZE_T*5($tp) # t[4]
+ addze $acc2,$zero # t[10]
+ $ST $acc5,$SIZE_T*6($tp) # t[5]
+ addc $acc6,$acc6,$t3
+ $UMULH $t3,$a7,$a2
+ adde $acc7,$acc7,$t0
+ $UMULL $t0,$a4,$a3 # lo(a[4..7]*a[3]) (iv)
+ adde $acc0,$acc0,$t1
+ $UMULL $t1,$a5,$a3
+ adde $acc1,$acc1,$t2
+ $UMULL $t2,$a6,$a3
+ adde $acc2,$acc2,$t3
+
+ $UMULL $t3,$a7,$a3
+ addc $acc7,$acc7,$t0
+ $UMULH $t0,$a4,$a3 # hi(a[4..7]*a[3])
+ adde $acc0,$acc0,$t1
+ $UMULH $t1,$a5,$a3
+ adde $acc1,$acc1,$t2
+ $UMULH $t2,$a6,$a3
+ adde $acc2,$acc2,$t3
+ $UMULH $t3,$a7,$a3
+ $ST $acc6,$SIZE_T*7($tp) # t[6]
+ addze $acc3,$zero # t[11]
+ $STU $acc7,$SIZE_T*8($tp) # t[7]
+ addc $acc0,$acc0,$t0
+ $UMULL $t0,$a5,$a4 # lo(a[5..7]*a[4]) (v)
+ adde $acc1,$acc1,$t1
+ $UMULL $t1,$a6,$a4
+ adde $acc2,$acc2,$t2
+ $UMULL $t2,$a7,$a4
+ adde $acc3,$acc3,$t3
+
+ $UMULH $t3,$a5,$a4 # hi(a[5..7]*a[4])
+ addc $acc1,$acc1,$t0
+ $UMULH $t0,$a6,$a4
+ adde $acc2,$acc2,$t1
+ $UMULH $t1,$a7,$a4
+ adde $acc3,$acc3,$t2
+ $UMULL $t2,$a6,$a5 # lo(a[6..7]*a[5]) (vi)
+ addze $acc4,$zero # t[12]
+ addc $acc2,$acc2,$t3
+ $UMULL $t3,$a7,$a5
+ adde $acc3,$acc3,$t0
+ $UMULH $t0,$a6,$a5 # hi(a[6..7]*a[5])
+ adde $acc4,$acc4,$t1
+
+ $UMULH $t1,$a7,$a5
+ addc $acc3,$acc3,$t2
+ $UMULL $t2,$a7,$a6 # lo(a[7]*a[6]) (vii)
+ adde $acc4,$acc4,$t3
+ $UMULH $t3,$a7,$a6 # hi(a[7]*a[6])
+ addze $acc5,$zero # t[13]
+ addc $acc4,$acc4,$t0
+ $UCMP $ap_end,$ap # done yet?
+ adde $acc5,$acc5,$t1
+
+ addc $acc5,$acc5,$t2
+	sub $t0,$ap_end,$num # rewound ap
+ addze $acc6,$zero # t[14]
+ add $acc6,$acc6,$t3
+
+ beq .Lsqr8x_outer_break
+
+ mr $n0,$a0
+ $LD $a0,$SIZE_T*1($tp)
+ $LD $a1,$SIZE_T*2($tp)
+ $LD $a2,$SIZE_T*3($tp)
+ $LD $a3,$SIZE_T*4($tp)
+ $LD $a4,$SIZE_T*5($tp)
+ $LD $a5,$SIZE_T*6($tp)
+ $LD $a6,$SIZE_T*7($tp)
+ $LD $a7,$SIZE_T*8($tp)
+ addc $acc0,$acc0,$a0
+ $LD $a0,$SIZE_T*1($ap)
+ adde $acc1,$acc1,$a1
+ $LD $a1,$SIZE_T*2($ap)
+ adde $acc2,$acc2,$a2
+ $LD $a2,$SIZE_T*3($ap)
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*4($ap)
+ adde $acc4,$acc4,$a4
+ $LD $a4,$SIZE_T*5($ap)
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*6($ap)
+ adde $acc6,$acc6,$a6
+ $LD $a6,$SIZE_T*7($ap)
+ subi $rp,$ap,$SIZE_T*7
+ addze $acc7,$a7
+ $LDU $a7,$SIZE_T*8($ap)
+ #addze $carry,$zero # moved below
+ li $cnt,0
+ b .Lsqr8x_mul
+
+ # a[8]a[0]
+ # a[9]a[0]
+ # a[a]a[0]
+ # a[b]a[0]
+ # a[c]a[0]
+ # a[d]a[0]
+ # a[e]a[0]
+ # a[f]a[0]
+ # a[8]a[1]
+ # a[f]a[1]........................
+ # a[8]a[2]
+ # a[f]a[2]........................
+ # a[8]a[3]
+ # a[f]a[3]........................
+ # a[8]a[4]
+ # a[f]a[4]........................
+ # a[8]a[5]
+ # a[f]a[5]........................
+ # a[8]a[6]
+ # a[f]a[6]........................
+ # a[8]a[7]
+ # a[f]a[7]........................
+.align 5
+.Lsqr8x_mul:
+ $UMULL $t0,$a0,$n0
+ addze $carry,$zero # carry bit, modulo-scheduled
+ $UMULL $t1,$a1,$n0
+ addi $cnt,$cnt,$SIZE_T
+ $UMULL $t2,$a2,$n0
+ andi. $cnt,$cnt,$SIZE_T*8-1
+ $UMULL $t3,$a3,$n0
+ addc $acc0,$acc0,$t0
+ $UMULL $t0,$a4,$n0
+ adde $acc1,$acc1,$t1
+ $UMULL $t1,$a5,$n0
+ adde $acc2,$acc2,$t2
+ $UMULL $t2,$a6,$n0
+ adde $acc3,$acc3,$t3
+ $UMULL $t3,$a7,$n0
+ adde $acc4,$acc4,$t0
+ $UMULH $t0,$a0,$n0
+ adde $acc5,$acc5,$t1
+ $UMULH $t1,$a1,$n0
+ adde $acc6,$acc6,$t2
+ $UMULH $t2,$a2,$n0
+ adde $acc7,$acc7,$t3
+ $UMULH $t3,$a3,$n0
+ addze $carry,$carry
+ $STU $acc0,$SIZE_T($tp)
+ addc $acc0,$acc1,$t0
+ $UMULH $t0,$a4,$n0
+ adde $acc1,$acc2,$t1
+ $UMULH $t1,$a5,$n0
+ adde $acc2,$acc3,$t2
+ $UMULH $t2,$a6,$n0
+ adde $acc3,$acc4,$t3
+ $UMULH $t3,$a7,$n0
+ $LDX $n0,$rp,$cnt
+ adde $acc4,$acc5,$t0
+ adde $acc5,$acc6,$t1
+ adde $acc6,$acc7,$t2
+ adde $acc7,$carry,$t3
+ #addze $carry,$zero # moved above
+ bne .Lsqr8x_mul
+ # note that carry flag is guaranteed
+ # to be zero at this point
+ $UCMP $ap,$ap_end # done yet?
+ beq .Lsqr8x_break
+
+ $LD $a0,$SIZE_T*1($tp)
+ $LD $a1,$SIZE_T*2($tp)
+ $LD $a2,$SIZE_T*3($tp)
+ $LD $a3,$SIZE_T*4($tp)
+ $LD $a4,$SIZE_T*5($tp)
+ $LD $a5,$SIZE_T*6($tp)
+ $LD $a6,$SIZE_T*7($tp)
+ $LD $a7,$SIZE_T*8($tp)
+ addc $acc0,$acc0,$a0
+ $LD $a0,$SIZE_T*1($ap)
+ adde $acc1,$acc1,$a1
+ $LD $a1,$SIZE_T*2($ap)
+ adde $acc2,$acc2,$a2
+ $LD $a2,$SIZE_T*3($ap)
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*4($ap)
+ adde $acc4,$acc4,$a4
+ $LD $a4,$SIZE_T*5($ap)
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*6($ap)
+ adde $acc6,$acc6,$a6
+ $LD $a6,$SIZE_T*7($ap)
+ adde $acc7,$acc7,$a7
+ $LDU $a7,$SIZE_T*8($ap)
+ #addze $carry,$zero # moved above
+ b .Lsqr8x_mul
+
+.align 5
+.Lsqr8x_break:
+ $LD $a0,$SIZE_T*8($rp)
+ addi $ap,$rp,$SIZE_T*15
+ $LD $a1,$SIZE_T*9($rp)
+ sub. $t0,$ap_end,$ap # is it last iteration?
+ $LD $a2,$SIZE_T*10($rp)
+ sub $t1,$tp,$t0
+ $LD $a3,$SIZE_T*11($rp)
+ $LD $a4,$SIZE_T*12($rp)
+ $LD $a5,$SIZE_T*13($rp)
+ $LD $a6,$SIZE_T*14($rp)
+ $LD $a7,$SIZE_T*15($rp)
+ beq .Lsqr8x_outer_loop
+
+ $ST $acc0,$SIZE_T*1($tp)
+ $LD $acc0,$SIZE_T*1($t1)
+ $ST $acc1,$SIZE_T*2($tp)
+ $LD $acc1,$SIZE_T*2($t1)
+ $ST $acc2,$SIZE_T*3($tp)
+ $LD $acc2,$SIZE_T*3($t1)
+ $ST $acc3,$SIZE_T*4($tp)
+ $LD $acc3,$SIZE_T*4($t1)
+ $ST $acc4,$SIZE_T*5($tp)
+ $LD $acc4,$SIZE_T*5($t1)
+ $ST $acc5,$SIZE_T*6($tp)
+ $LD $acc5,$SIZE_T*6($t1)
+ $ST $acc6,$SIZE_T*7($tp)
+ $LD $acc6,$SIZE_T*7($t1)
+ $ST $acc7,$SIZE_T*8($tp)
+ $LD $acc7,$SIZE_T*8($t1)
+ mr $tp,$t1
+ b .Lsqr8x_outer_loop
+
+.align 5
+.Lsqr8x_outer_break:
+ ####################################################################
+ # Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
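+	# i.e. use (sum a[i]*2^(i*BITS))^2 =
+	#          2*sum_{i>j} a[i]*a[j]*2^((i+j)*BITS)
+	#        + sum a[i]^2*2^(2*i*BITS);
+	# only the cross products were accumulated above, the doubling
+	# and the squares on the diagonal are folded in below.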
+ $LD $a1,$SIZE_T*1($t0) # recall that $t0 is &a[-1]
+ $LD $a3,$SIZE_T*2($t0)
+ $LD $a5,$SIZE_T*3($t0)
+ $LD $a7,$SIZE_T*4($t0)
+ addi $ap,$t0,$SIZE_T*4
+ # "tp[x]" comments are for num==8 case
+ $LD $t1,$SIZE_T*13($sp) # =tp[1], t[0] is not interesting
+ $LD $t2,$SIZE_T*14($sp)
+ $LD $t3,$SIZE_T*15($sp)
+ $LD $t0,$SIZE_T*16($sp)
+
+ $ST $acc0,$SIZE_T*1($tp) # tp[8]=
+ srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
+ $ST $acc1,$SIZE_T*2($tp)
+ subi $cnt,$cnt,1
+ $ST $acc2,$SIZE_T*3($tp)
+ $ST $acc3,$SIZE_T*4($tp)
+ $ST $acc4,$SIZE_T*5($tp)
+ $ST $acc5,$SIZE_T*6($tp)
+ $ST $acc6,$SIZE_T*7($tp)
+ #$ST $acc7,$SIZE_T*8($tp) # tp[15] is not interesting
+ addi $tp,$sp,$SIZE_T*11 # &tp[-1]
+ $UMULL $acc0,$a1,$a1
+ $UMULH $a1,$a1,$a1
+ add $acc1,$t1,$t1 # <<1
+ $SHRI $t1,$t1,$BITS-1
+ $UMULL $a2,$a3,$a3
+ $UMULH $a3,$a3,$a3
+ addc $acc1,$acc1,$a1
+ add $acc2,$t2,$t2
+ $SHRI $t2,$t2,$BITS-1
+ add $acc3,$t3,$t3
+ $SHRI $t3,$t3,$BITS-1
+ or $acc2,$acc2,$t1
+
+ mtctr $cnt
+.Lsqr4x_shift_n_add:
+ $UMULL $a4,$a5,$a5
+ $UMULH $a5,$a5,$a5
+ $LD $t1,$SIZE_T*6($tp) # =tp[5]
+ $LD $a1,$SIZE_T*1($ap)
+ adde $acc2,$acc2,$a2
+ add $acc4,$t0,$t0
+ $SHRI $t0,$t0,$BITS-1
+ or $acc3,$acc3,$t2
+ $LD $t2,$SIZE_T*7($tp) # =tp[6]
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*2($ap)
+ add $acc5,$t1,$t1
+ $SHRI $t1,$t1,$BITS-1
+ or $acc4,$acc4,$t3
+ $LD $t3,$SIZE_T*8($tp) # =tp[7]
+ $UMULL $a6,$a7,$a7
+ $UMULH $a7,$a7,$a7
+ adde $acc4,$acc4,$a4
+ add $acc6,$t2,$t2
+ $SHRI $t2,$t2,$BITS-1
+ or $acc5,$acc5,$t0
+ $LD $t0,$SIZE_T*9($tp) # =tp[8]
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*3($ap)
+ add $acc7,$t3,$t3
+ $SHRI $t3,$t3,$BITS-1
+ or $acc6,$acc6,$t1
+ $LD $t1,$SIZE_T*10($tp) # =tp[9]
+ $UMULL $a0,$a1,$a1
+ $UMULH $a1,$a1,$a1
+ adde $acc6,$acc6,$a6
+ $ST $acc0,$SIZE_T*1($tp) # tp[0]=
+ add $acc0,$t0,$t0
+ $SHRI $t0,$t0,$BITS-1
+ or $acc7,$acc7,$t2
+ $LD $t2,$SIZE_T*11($tp) # =tp[10]
+ adde $acc7,$acc7,$a7
+ $LDU $a7,$SIZE_T*4($ap)
+ $ST $acc1,$SIZE_T*2($tp) # tp[1]=
+ add $acc1,$t1,$t1
+ $SHRI $t1,$t1,$BITS-1
+ or $acc0,$acc0,$t3
+ $LD $t3,$SIZE_T*12($tp) # =tp[11]
+ $UMULL $a2,$a3,$a3
+ $UMULH $a3,$a3,$a3
+ adde $acc0,$acc0,$a0
+ $ST $acc2,$SIZE_T*3($tp) # tp[2]=
+ add $acc2,$t2,$t2
+ $SHRI $t2,$t2,$BITS-1
+ or $acc1,$acc1,$t0
+ $LD $t0,$SIZE_T*13($tp) # =tp[12]
+ adde $acc1,$acc1,$a1
+ $ST $acc3,$SIZE_T*4($tp) # tp[3]=
+ $ST $acc4,$SIZE_T*5($tp) # tp[4]=
+ $ST $acc5,$SIZE_T*6($tp) # tp[5]=
+ $ST $acc6,$SIZE_T*7($tp) # tp[6]=
+ $STU $acc7,$SIZE_T*8($tp) # tp[7]=
+ add $acc3,$t3,$t3
+ $SHRI $t3,$t3,$BITS-1
+ or $acc2,$acc2,$t1
+ bdnz .Lsqr4x_shift_n_add
+___
+my ($np,$np_end)=($ap,$ap_end);
+$code.=<<___;
+ $POP $np,$SIZE_T*7($sp) # pull &np[-1] and n0
+ $POP $n0,$SIZE_T*8($sp)
+
+ $UMULL $a4,$a5,$a5
+ $UMULH $a5,$a5,$a5
+ $ST $acc0,$SIZE_T*1($tp) # tp[8]=
+ $LD $acc0,$SIZE_T*12($sp) # =tp[0]
+ $LD $t1,$SIZE_T*6($tp) # =tp[13]
+ adde $acc2,$acc2,$a2
+ add $acc4,$t0,$t0
+ $SHRI $t0,$t0,$BITS-1
+ or $acc3,$acc3,$t2
+ $LD $t2,$SIZE_T*7($tp) # =tp[14]
+ adde $acc3,$acc3,$a3
+ add $acc5,$t1,$t1
+ $SHRI $t1,$t1,$BITS-1
+ or $acc4,$acc4,$t3
+ $UMULL $a6,$a7,$a7
+ $UMULH $a7,$a7,$a7
+ adde $acc4,$acc4,$a4
+ add $acc6,$t2,$t2
+ $SHRI $t2,$t2,$BITS-1
+ or $acc5,$acc5,$t0
+ $ST $acc1,$SIZE_T*2($tp) # tp[9]=
+ $LD $acc1,$SIZE_T*13($sp) # =tp[1]
+ adde $acc5,$acc5,$a5
+ or $acc6,$acc6,$t1
+ $LD $a0,$SIZE_T*1($np)
+ $LD $a1,$SIZE_T*2($np)
+ adde $acc6,$acc6,$a6
+ $LD $a2,$SIZE_T*3($np)
+ $LD $a3,$SIZE_T*4($np)
+ adde $acc7,$a7,$t2
+ $LD $a4,$SIZE_T*5($np)
+ $LD $a5,$SIZE_T*6($np)
+
+ ################################################################
+ # Reduce by 8 limbs per iteration
+ $UMULL $na0,$n0,$acc0 # t[0]*n0
+ li $cnt,8
+ $LD $a6,$SIZE_T*7($np)
+ add $np_end,$np,$num
+ $LDU $a7,$SIZE_T*8($np)
+ $ST $acc2,$SIZE_T*3($tp) # tp[10]=
+ $LD $acc2,$SIZE_T*14($sp)
+ $ST $acc3,$SIZE_T*4($tp) # tp[11]=
+ $LD $acc3,$SIZE_T*15($sp)
+ $ST $acc4,$SIZE_T*5($tp) # tp[12]=
+ $LD $acc4,$SIZE_T*16($sp)
+ $ST $acc5,$SIZE_T*6($tp) # tp[13]=
+ $LD $acc5,$SIZE_T*17($sp)
+ $ST $acc6,$SIZE_T*7($tp) # tp[14]=
+ $LD $acc6,$SIZE_T*18($sp)
+ $ST $acc7,$SIZE_T*8($tp) # tp[15]=
+ $LD $acc7,$SIZE_T*19($sp)
+ addi $tp,$sp,$SIZE_T*11 # &tp[-1]
+ mtctr $cnt
+ b .Lsqr8x_reduction
+
+.align 5
+.Lsqr8x_reduction:
+ # (*) $UMULL $t0,$a0,$na0 # lo(n[0-7])*lo(t[0]*n0)
+ $UMULL $t1,$a1,$na0
+ $UMULL $t2,$a2,$na0
+ $STU $na0,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
+ $UMULL $t3,$a3,$na0
+ # (*) addc $acc0,$acc0,$t0
+ addic $acc0,$acc0,-1 # (*)
+ $UMULL $t0,$a4,$na0
+ adde $acc0,$acc1,$t1
+ $UMULL $t1,$a5,$na0
+ adde $acc1,$acc2,$t2
+ $UMULL $t2,$a6,$na0
+ adde $acc2,$acc3,$t3
+ $UMULL $t3,$a7,$na0
+ adde $acc3,$acc4,$t0
+ $UMULH $t0,$a0,$na0 # hi(n[0-7])*lo(t[0]*n0)
+ adde $acc4,$acc5,$t1
+ $UMULH $t1,$a1,$na0
+ adde $acc5,$acc6,$t2
+ $UMULH $t2,$a2,$na0
+ adde $acc6,$acc7,$t3
+ $UMULH $t3,$a3,$na0
+ addze $acc7,$zero
+ addc $acc0,$acc0,$t0
+ $UMULH $t0,$a4,$na0
+ adde $acc1,$acc1,$t1
+ $UMULH $t1,$a5,$na0
+ adde $acc2,$acc2,$t2
+ $UMULH $t2,$a6,$na0
+ adde $acc3,$acc3,$t3
+ $UMULH $t3,$a7,$na0
+ $UMULL $na0,$n0,$acc0 # next t[0]*n0
+ adde $acc4,$acc4,$t0
+ adde $acc5,$acc5,$t1
+ adde $acc6,$acc6,$t2
+ adde $acc7,$acc7,$t3
+ bdnz .Lsqr8x_reduction
+
+ $LD $t0,$SIZE_T*1($tp)
+ $LD $t1,$SIZE_T*2($tp)
+ $LD $t2,$SIZE_T*3($tp)
+ $LD $t3,$SIZE_T*4($tp)
+ subi $rp,$tp,$SIZE_T*7
+ $UCMP $np_end,$np # done yet?
+ addc $acc0,$acc0,$t0
+ $LD $t0,$SIZE_T*5($tp)
+ adde $acc1,$acc1,$t1
+ $LD $t1,$SIZE_T*6($tp)
+ adde $acc2,$acc2,$t2
+ $LD $t2,$SIZE_T*7($tp)
+ adde $acc3,$acc3,$t3
+ $LD $t3,$SIZE_T*8($tp)
+ adde $acc4,$acc4,$t0
+ adde $acc5,$acc5,$t1
+ adde $acc6,$acc6,$t2
+ adde $acc7,$acc7,$t3
+ #addze $carry,$zero # moved below
+ beq .Lsqr8x8_post_condition
+
+ $LD $n0,$SIZE_T*0($rp)
+ $LD $a0,$SIZE_T*1($np)
+ $LD $a1,$SIZE_T*2($np)
+ $LD $a2,$SIZE_T*3($np)
+ $LD $a3,$SIZE_T*4($np)
+ $LD $a4,$SIZE_T*5($np)
+ $LD $a5,$SIZE_T*6($np)
+ $LD $a6,$SIZE_T*7($np)
+ $LDU $a7,$SIZE_T*8($np)
+ li $cnt,0
+
+.align 5
+.Lsqr8x_tail:
+ $UMULL $t0,$a0,$n0
+ addze $carry,$zero # carry bit, modulo-scheduled
+ $UMULL $t1,$a1,$n0
+ addi $cnt,$cnt,$SIZE_T
+ $UMULL $t2,$a2,$n0
+ andi. $cnt,$cnt,$SIZE_T*8-1
+ $UMULL $t3,$a3,$n0
+ addc $acc0,$acc0,$t0
+ $UMULL $t0,$a4,$n0
+ adde $acc1,$acc1,$t1
+ $UMULL $t1,$a5,$n0
+ adde $acc2,$acc2,$t2
+ $UMULL $t2,$a6,$n0
+ adde $acc3,$acc3,$t3
+ $UMULL $t3,$a7,$n0
+ adde $acc4,$acc4,$t0
+ $UMULH $t0,$a0,$n0
+ adde $acc5,$acc5,$t1
+ $UMULH $t1,$a1,$n0
+ adde $acc6,$acc6,$t2
+ $UMULH $t2,$a2,$n0
+ adde $acc7,$acc7,$t3
+ $UMULH $t3,$a3,$n0
+ addze $carry,$carry
+ $STU $acc0,$SIZE_T($tp)
+ addc $acc0,$acc1,$t0
+ $UMULH $t0,$a4,$n0
+ adde $acc1,$acc2,$t1
+ $UMULH $t1,$a5,$n0
+ adde $acc2,$acc3,$t2
+ $UMULH $t2,$a6,$n0
+ adde $acc3,$acc4,$t3
+ $UMULH $t3,$a7,$n0
+ $LDX $n0,$rp,$cnt
+ adde $acc4,$acc5,$t0
+ adde $acc5,$acc6,$t1
+ adde $acc6,$acc7,$t2
+ adde $acc7,$carry,$t3
+ #addze $carry,$zero # moved above
+ bne .Lsqr8x_tail
+ # note that carry flag is guaranteed
+ # to be zero at this point
+ $LD $a0,$SIZE_T*1($tp)
+ $POP $carry,$SIZE_T*10($sp) # pull top-most carry in case we break
+ $UCMP $np_end,$np # done yet?
+ $LD $a1,$SIZE_T*2($tp)
+	sub $t2,$np_end,$num # rewound np
+ $LD $a2,$SIZE_T*3($tp)
+ $LD $a3,$SIZE_T*4($tp)
+ $LD $a4,$SIZE_T*5($tp)
+ $LD $a5,$SIZE_T*6($tp)
+ $LD $a6,$SIZE_T*7($tp)
+ $LD $a7,$SIZE_T*8($tp)
+ beq .Lsqr8x_tail_break
+
+ addc $acc0,$acc0,$a0
+ $LD $a0,$SIZE_T*1($np)
+ adde $acc1,$acc1,$a1
+ $LD $a1,$SIZE_T*2($np)
+ adde $acc2,$acc2,$a2
+ $LD $a2,$SIZE_T*3($np)
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*4($np)
+ adde $acc4,$acc4,$a4
+ $LD $a4,$SIZE_T*5($np)
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*6($np)
+ adde $acc6,$acc6,$a6
+ $LD $a6,$SIZE_T*7($np)
+ adde $acc7,$acc7,$a7
+ $LDU $a7,$SIZE_T*8($np)
+ #addze $carry,$zero # moved above
+ b .Lsqr8x_tail
+
+.align 5
+.Lsqr8x_tail_break:
+ $POP $n0,$SIZE_T*8($sp) # pull n0
+ $POP $t3,$SIZE_T*9($sp) # &tp[2*num-1]
+ addi $cnt,$tp,$SIZE_T*8 # end of current t[num] window
+
+ addic $carry,$carry,-1 # "move" top-most carry to carry bit
+ adde $t0,$acc0,$a0
+ $LD $acc0,$SIZE_T*8($rp)
+ $LD $a0,$SIZE_T*1($t2) # recall that $t2 is &n[-1]
+ adde $t1,$acc1,$a1
+ $LD $acc1,$SIZE_T*9($rp)
+ $LD $a1,$SIZE_T*2($t2)
+ adde $acc2,$acc2,$a2
+ $LD $a2,$SIZE_T*3($t2)
+ adde $acc3,$acc3,$a3
+ $LD $a3,$SIZE_T*4($t2)
+ adde $acc4,$acc4,$a4
+ $LD $a4,$SIZE_T*5($t2)
+ adde $acc5,$acc5,$a5
+ $LD $a5,$SIZE_T*6($t2)
+ adde $acc6,$acc6,$a6
+ $LD $a6,$SIZE_T*7($t2)
+ adde $acc7,$acc7,$a7
+ $LD $a7,$SIZE_T*8($t2)
+ addi $np,$t2,$SIZE_T*8
+ addze $t2,$zero # top-most carry
+ $UMULL $na0,$n0,$acc0
+ $ST $t0,$SIZE_T*1($tp)
+ $UCMP $cnt,$t3 # did we hit the bottom?
+ $ST $t1,$SIZE_T*2($tp)
+ li $cnt,8
+ $ST $acc2,$SIZE_T*3($tp)
+ $LD $acc2,$SIZE_T*10($rp)
+ $ST $acc3,$SIZE_T*4($tp)
+ $LD $acc3,$SIZE_T*11($rp)
+ $ST $acc4,$SIZE_T*5($tp)
+ $LD $acc4,$SIZE_T*12($rp)
+ $ST $acc5,$SIZE_T*6($tp)
+ $LD $acc5,$SIZE_T*13($rp)
+ $ST $acc6,$SIZE_T*7($tp)
+ $LD $acc6,$SIZE_T*14($rp)
+ $ST $acc7,$SIZE_T*8($tp)
+ $LD $acc7,$SIZE_T*15($rp)
+ $PUSH $t2,$SIZE_T*10($sp) # off-load top-most carry
+ addi $tp,$rp,$SIZE_T*7 # slide the window
+ mtctr $cnt
+ bne .Lsqr8x_reduction
+
+ ################################################################
+	# Final step. We check whether the result is larger than the
+	# modulus and, if it is, subtract the modulus. But comparison
+	# implies subtraction, so we subtract the modulus, see if it
+	# borrowed, and conditionally copy the original value.
+ $POP $rp,$SIZE_T*6($sp) # pull &rp[-1]
+ srwi $cnt,$num,`log($SIZE_T)/log(2)+3`
+ mr $n0,$tp # put tp aside
+ addi $tp,$tp,$SIZE_T*8
+ subi $cnt,$cnt,1
+ subfc $t0,$a0,$acc0
+ subfe $t1,$a1,$acc1
+ mr $carry,$t2
+ mr $ap_end,$rp # $rp copy
+
+ mtctr $cnt
+ b .Lsqr8x_sub
+
+.align 5
+.Lsqr8x_sub:
+ $LD $a0,$SIZE_T*1($np)
+ $LD $acc0,$SIZE_T*1($tp)
+ $LD $a1,$SIZE_T*2($np)
+ $LD $acc1,$SIZE_T*2($tp)
+ subfe $t2,$a2,$acc2
+ $LD $a2,$SIZE_T*3($np)
+ $LD $acc2,$SIZE_T*3($tp)
+ subfe $t3,$a3,$acc3
+ $LD $a3,$SIZE_T*4($np)
+ $LD $acc3,$SIZE_T*4($tp)
+ $ST $t0,$SIZE_T*1($rp)
+ subfe $t0,$a4,$acc4
+ $LD $a4,$SIZE_T*5($np)
+ $LD $acc4,$SIZE_T*5($tp)
+ $ST $t1,$SIZE_T*2($rp)
+ subfe $t1,$a5,$acc5
+ $LD $a5,$SIZE_T*6($np)
+ $LD $acc5,$SIZE_T*6($tp)
+ $ST $t2,$SIZE_T*3($rp)
+ subfe $t2,$a6,$acc6
+ $LD $a6,$SIZE_T*7($np)
+ $LD $acc6,$SIZE_T*7($tp)
+ $ST $t3,$SIZE_T*4($rp)
+ subfe $t3,$a7,$acc7
+ $LDU $a7,$SIZE_T*8($np)
+ $LDU $acc7,$SIZE_T*8($tp)
+ $ST $t0,$SIZE_T*5($rp)
+ subfe $t0,$a0,$acc0
+ $ST $t1,$SIZE_T*6($rp)
+ subfe $t1,$a1,$acc1
+ $ST $t2,$SIZE_T*7($rp)
+ $STU $t3,$SIZE_T*8($rp)
+ bdnz .Lsqr8x_sub
+
+ srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
+ $LD $a0,$SIZE_T*1($ap_end) # original $rp
+ $LD $acc0,$SIZE_T*1($n0) # original $tp
+ subi $cnt,$cnt,1
+ $LD $a1,$SIZE_T*2($ap_end)
+ $LD $acc1,$SIZE_T*2($n0)
+ subfe $t2,$a2,$acc2
+ $LD $a2,$SIZE_T*3($ap_end)
+ $LD $acc2,$SIZE_T*3($n0)
+ subfe $t3,$a3,$acc3
+ $LD $a3,$SIZE_T*4($ap_end)
+ $LDU $acc3,$SIZE_T*4($n0)
+ $ST $t0,$SIZE_T*1($rp)
+ subfe $t0,$a4,$acc4
+ $ST $t1,$SIZE_T*2($rp)
+ subfe $t1,$a5,$acc5
+ $ST $t2,$SIZE_T*3($rp)
+ subfe $t2,$a6,$acc6
+ $ST $t3,$SIZE_T*4($rp)
+ subfe $t3,$a7,$acc7
+ $ST $t0,$SIZE_T*5($rp)
+ subfe $carry,$zero,$carry # did it borrow?
+ $ST $t1,$SIZE_T*6($rp)
+ $ST $t2,$SIZE_T*7($rp)
+ $ST $t3,$SIZE_T*8($rp)
+
+ addi $tp,$sp,$SIZE_T*11
+ mtctr $cnt
+
+.Lsqr4x_cond_copy:
+ andc $a0,$a0,$carry
+ $ST $zero,-$SIZE_T*3($n0) # wipe stack clean
+ and $acc0,$acc0,$carry
+ $ST $zero,-$SIZE_T*2($n0)
+ andc $a1,$a1,$carry
+ $ST $zero,-$SIZE_T*1($n0)
+ and $acc1,$acc1,$carry
+ $ST $zero,-$SIZE_T*0($n0)
+ andc $a2,$a2,$carry
+ $ST $zero,$SIZE_T*1($tp)
+ and $acc2,$acc2,$carry
+ $ST $zero,$SIZE_T*2($tp)
+ andc $a3,$a3,$carry
+ $ST $zero,$SIZE_T*3($tp)
+ and $acc3,$acc3,$carry
+ $STU $zero,$SIZE_T*4($tp)
+ or $t0,$a0,$acc0
+ $LD $a0,$SIZE_T*5($ap_end)
+ $LD $acc0,$SIZE_T*1($n0)
+ or $t1,$a1,$acc1
+ $LD $a1,$SIZE_T*6($ap_end)
+ $LD $acc1,$SIZE_T*2($n0)
+ or $t2,$a2,$acc2
+ $LD $a2,$SIZE_T*7($ap_end)
+ $LD $acc2,$SIZE_T*3($n0)
+ or $t3,$a3,$acc3
+ $LD $a3,$SIZE_T*8($ap_end)
+ $LDU $acc3,$SIZE_T*4($n0)
+ $ST $t0,$SIZE_T*1($ap_end)
+ $ST $t1,$SIZE_T*2($ap_end)
+ $ST $t2,$SIZE_T*3($ap_end)
+ $STU $t3,$SIZE_T*4($ap_end)
+ bdnz .Lsqr4x_cond_copy
+
+ $POP $ap,0($sp) # pull saved sp
+ andc $a0,$a0,$carry
+ and $acc0,$acc0,$carry
+ andc $a1,$a1,$carry
+ and $acc1,$acc1,$carry
+ andc $a2,$a2,$carry
+ and $acc2,$acc2,$carry
+ andc $a3,$a3,$carry
+ and $acc3,$acc3,$carry
+ or $t0,$a0,$acc0
+ or $t1,$a1,$acc1
+ or $t2,$a2,$acc2
+ or $t3,$a3,$acc3
+ $ST $t0,$SIZE_T*1($ap_end)
+ $ST $t1,$SIZE_T*2($ap_end)
+ $ST $t2,$SIZE_T*3($ap_end)
+ $ST $t3,$SIZE_T*4($ap_end)
+
+ b .Lsqr8x_done
+
+.align 5
+.Lsqr8x8_post_condition:
+ $POP $rp,$SIZE_T*6($sp) # pull rp
+ $POP $ap,0($sp) # pull saved sp
+ addze $carry,$zero
+
+ # $acc0-7,$carry hold result, $a0-7 hold modulus
+ subfc $acc0,$a0,$acc0
+ subfe $acc1,$a1,$acc1
+ $ST $zero,$SIZE_T*12($sp) # wipe stack clean
+ $ST $zero,$SIZE_T*13($sp)
+ subfe $acc2,$a2,$acc2
+ $ST $zero,$SIZE_T*14($sp)
+ $ST $zero,$SIZE_T*15($sp)
+ subfe $acc3,$a3,$acc3
+ $ST $zero,$SIZE_T*16($sp)
+ $ST $zero,$SIZE_T*17($sp)
+ subfe $acc4,$a4,$acc4
+ $ST $zero,$SIZE_T*18($sp)
+ $ST $zero,$SIZE_T*19($sp)
+ subfe $acc5,$a5,$acc5
+ $ST $zero,$SIZE_T*20($sp)
+ $ST $zero,$SIZE_T*21($sp)
+ subfe $acc6,$a6,$acc6
+ $ST $zero,$SIZE_T*22($sp)
+ $ST $zero,$SIZE_T*23($sp)
+ subfe $acc7,$a7,$acc7
+ $ST $zero,$SIZE_T*24($sp)
+ $ST $zero,$SIZE_T*25($sp)
+ subfe $carry,$zero,$carry # did it borrow?
+ $ST $zero,$SIZE_T*26($sp)
+ $ST $zero,$SIZE_T*27($sp)
+
+ and $a0,$a0,$carry
+ and $a1,$a1,$carry
+ addc $acc0,$acc0,$a0 # add modulus back if borrowed
+ and $a2,$a2,$carry
+ adde $acc1,$acc1,$a1
+ and $a3,$a3,$carry
+ adde $acc2,$acc2,$a2
+ and $a4,$a4,$carry
+ adde $acc3,$acc3,$a3
+ and $a5,$a5,$carry
+ adde $acc4,$acc4,$a4
+ and $a6,$a6,$carry
+ adde $acc5,$acc5,$a5
+ and $a7,$a7,$carry
+ adde $acc6,$acc6,$a6
+ adde $acc7,$acc7,$a7
+ $ST $acc0,$SIZE_T*1($rp)
+ $ST $acc1,$SIZE_T*2($rp)
+ $ST $acc2,$SIZE_T*3($rp)
+ $ST $acc3,$SIZE_T*4($rp)
+ $ST $acc4,$SIZE_T*5($rp)
+ $ST $acc5,$SIZE_T*6($rp)
+ $ST $acc6,$SIZE_T*7($rp)
+ $ST $acc7,$SIZE_T*8($rp)
+
+.Lsqr8x_done:
+ $PUSH $zero,$SIZE_T*8($sp)
+ $PUSH $zero,$SIZE_T*10($sp)
+
+ $POP r14,-$SIZE_T*18($ap)
+ li r3,1 # signal "done"
+ $POP r15,-$SIZE_T*17($ap)
+ $POP r16,-$SIZE_T*16($ap)
+ $POP r17,-$SIZE_T*15($ap)
+ $POP r18,-$SIZE_T*14($ap)
+ $POP r19,-$SIZE_T*13($ap)
+ $POP r20,-$SIZE_T*12($ap)
+ $POP r21,-$SIZE_T*11($ap)
+ $POP r22,-$SIZE_T*10($ap)
+ $POP r23,-$SIZE_T*9($ap)
+ $POP r24,-$SIZE_T*8($ap)
+ $POP r25,-$SIZE_T*7($ap)
+ $POP r26,-$SIZE_T*6($ap)
+ $POP r27,-$SIZE_T*5($ap)
+ $POP r28,-$SIZE_T*4($ap)
+ $POP r29,-$SIZE_T*3($ap)
+ $POP r30,-$SIZE_T*2($ap)
+ $POP r31,-$SIZE_T*1($ap)
+ mr $sp,$ap
+ blr
+ .long 0
+ .byte 0,12,4,0x20,0x80,18,6,0
+ .long 0
+.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
+___
+}
$code.=<<___;
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___