diff options
author | Ralf S. Engelschall <rse@openssl.org> | 1998-12-21 11:00:56 +0000 |
---|---|---|
committer | Ralf S. Engelschall <rse@openssl.org> | 1998-12-21 11:00:56 +0000 |
commit | dfeab0689f69c0b4bd3480ffd37a9cacc2f17d9c (patch) | |
tree | 2f74e0cfd76a9e092548a9bf52e579aef984299b /crypto/bn | |
parent | 58964a492275ca9a59a0cd9c8155cb2491b4b909 (diff) | |
download | openssl-dfeab0689f69c0b4bd3480ffd37a9cacc2f17d9c.tar.gz |
Import of old SSLeay release: SSLeay 0.9.1b (unreleased)
Diffstat (limited to 'crypto/bn')
100 files changed, 25588 insertions, 972 deletions
diff --git a/crypto/bn/DSA b/crypto/bn/DSA new file mode 100644 index 0000000000..83f257c84f --- /dev/null +++ b/crypto/bn/DSA @@ -0,0 +1,2 @@ +DSA wants 64*32 to use word mont mul, but +RSA wants to use full. diff --git a/crypto/bn/Makefile.ssl b/crypto/bn/Makefile.ssl index 9809d26cbc..0a365fca6a 100644 --- a/crypto/bn/Makefile.ssl +++ b/crypto/bn/Makefile.ssl @@ -13,9 +13,9 @@ MAKEDEPEND= makedepend -f Makefile.ssl MAKEFILE= Makefile.ssl AR= ar r -BN_MULW= bn_mulw.o +BN_ASM= bn_asm.o # or use -#BN_MULW= bn86-elf.o +#BN_ASM= bn86-elf.o CFLAGS= $(INCLUDES) $(CFLAG) @@ -26,16 +26,15 @@ TEST=bntest.c exptest.c APPS= LIB=$(TOP)/libcrypto.a -LIBSRC= bn_add.c bn_div.c bn_exp.c bn_lib.c bn_mod.c bn_mul.c \ - bn_print.c bn_rand.c bn_shift.c bn_sub.c bn_word.c bn_blind.c \ - bn_gcd.c bn_prime.c $(ERRC).c bn_sqr.c bn_mulw.c bn_recp.c bn_mont.c \ - bn_mpi.c - -LIBOBJ= bn_add.o bn_div.o bn_exp.o bn_lib.o bn_mod.o bn_mul.o \ - bn_print.o bn_rand.o bn_shift.o bn_sub.o bn_word.o bn_blind.o \ - bn_gcd.o bn_prime.o $(ERRC).o bn_sqr.o $(BN_MULW) bn_recp.o bn_mont.o \ - bn_mpi.o +LIBSRC= bn_add.c bn_div.c bn_exp.c bn_lib.c bn_mul.c \ + bn_print.c bn_rand.c bn_shift.c bn_word.c bn_blind.c \ + bn_gcd.c bn_prime.c $(ERRC).c bn_sqr.c bn_asm.c bn_recp.c bn_mont.c \ + bn_mpi.c bn_exp2.c +LIBOBJ= bn_add.o bn_div.o bn_exp.o bn_lib.o bn_mul.o \ + bn_print.o bn_rand.o bn_shift.o bn_word.o bn_blind.o \ + bn_gcd.o bn_prime.o $(ERRC).o bn_sqr.o $(BN_ASM) bn_recp.o bn_mont.o \ + bn_mpi.o bn_exp2.o SRC= $(LIBSRC) @@ -65,23 +64,48 @@ lib: $(LIBOBJ) asm/bn86-elf.o: asm/bn86unix.cpp $(CPP) -DELF asm/bn86unix.cpp | as -o asm/bn86-elf.o +asm/co86-elf.o: asm/co86unix.cpp + $(CPP) -DELF asm/co86unix.cpp | as -o asm/co86-elf.o + # solaris asm/bn86-sol.o: asm/bn86unix.cpp $(CC) -E -DSOL asm/bn86unix.cpp | sed 's/^#.*//' > asm/bn86-sol.s as -o asm/bn86-sol.o asm/bn86-sol.s rm -f asm/bn86-sol.s +asm/co86-sol.o: asm/co86unix.cpp + $(CC) -E -DSOL asm/co86unix.cpp | sed 's/^#.*//' > asm/co86-sol.s + as -o 
asm/co86-sol.o asm/co86-sol.s + rm -f asm/co86-sol.s + # a.out asm/bn86-out.o: asm/bn86unix.cpp $(CPP) -DOUT asm/bn86unix.cpp | as -o asm/bn86-out.o +asm/co86-out.o: asm/co86unix.cpp + $(CPP) -DOUT asm/co86unix.cpp | as -o asm/co86-out.o + # bsdi asm/bn86bsdi.o: asm/bn86unix.cpp - $(CPP) -DBSDI asm/bn86unix.cpp | as -o asm/bn86bsdi.o + $(CPP) -DBSDI asm/bn86unix.cpp | sed 's/ :/:/' | as -o asm/bn86bsdi.o + +asm/co86bsdi.o: asm/co86unix.cpp + $(CPP) -DBSDI asm/co86unix.cpp | sed 's/ :/:/' | as -o asm/co86bsdi.o asm/bn86unix.cpp: (cd asm; perl bn-586.pl cpp >bn86unix.cpp ) +asm/co86unix.cpp: + (cd asm; perl co-586.pl cpp >co86unix.cpp ) + +# MIPS 64 bit assember +asm/mips3.o: asm/mips3.s + /usr/bin/as -mips3 -O2 -o asm/mips3.o asm/mips3.s + +# MIPS 32 bit assember +asm/mips1.o: asm/mips1.s + /usr/bin/as -O2 -o asm/mips1.o asm/mips1.s + files: perl $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO @@ -123,7 +147,7 @@ dclean: mv -f Makefile.new $(MAKEFILE) clean: - /bin/rm -f *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff bn_mulw.s + /bin/rm -f *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff bn_asm.s errors: perl $(TOP)/util/err-ins.pl $(ERR).err $(ERR).org # special case .org diff --git a/crypto/bn/alpha.s b/crypto/bn/alpha.s new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/crypto/bn/alpha.s diff --git a/crypto/bn/asm/a.out b/crypto/bn/asm/a.out Binary files differnew file mode 100644 index 0000000000..cc5094ff45 --- /dev/null +++ b/crypto/bn/asm/a.out diff --git a/crypto/bn/asm/alpha.s b/crypto/bn/asm/alpha.s index 1d17b1d619..cf0b69cff9 100644 --- a/crypto/bn/asm/alpha.s +++ b/crypto/bn/asm/alpha.s @@ -2,7 +2,13 @@ # The bn_div64 is actually gcc output but the other parts are hand done. # Thanks to tzeruch@ceddec.com for sending me the gcc output for # bn_div64. - .file 1 "bn_mulw.c" + # I've gone back and re-done most of routines. 
+ # The key thing to remeber for the 164 CPU is that while a + # multiply operation takes 8 cycles, another one can only be issued + # after 4 cycles have elapsed. I've done modification to help + # improve this. Also, normally, a ld instruction will not be available + # for about 3 cycles. + .file 1 "bn_asm.c" .set noat gcc2_compiled.: __gnu_compiled_c: @@ -14,65 +20,91 @@ bn_mul_add_words: bn_mul_add_words..ng: .frame $30,0,$26,0 .prologue 0 - subq $18,2,$25 # num=-2 - bis $31,$31,$0 - blt $25,$42 .align 5 -$142: - subq $18,2,$18 # num-=2 - subq $25,2,$25 # num-=2 - - ldq $1,0($17) # a[0] - ldq $2,8($17) # a[1] - - mulq $19,$1,$3 # a[0]*w low part r3 - umulh $19,$1,$1 # a[0]*w high part r1 - mulq $19,$2,$4 # a[1]*w low part r4 - umulh $19,$2,$2 # a[1]*w high part r2 - - ldq $22,0($16) # r[0] r22 - ldq $23,8($16) # r[1] r23 - - addq $3,$22,$3 # a0 low part + r[0] - addq $4,$23,$4 # a1 low part + r[1] - cmpult $3,$22,$5 # overflow? - cmpult $4,$23,$6 # overflow? - addq $5,$1,$1 # high part + overflow - addq $6,$2,$2 # high part + overflow - - addq $3,$0,$3 # add c - cmpult $3,$0,$5 # overflow? - stq $3,0($16) - addq $5,$1,$0 # c=high part + overflow - - addq $4,$0,$4 # add c - cmpult $4,$0,$5 # overflow? 
- stq $4,8($16) - addq $5,$2,$0 # c=high part + overflow + subq $18,4,$18 + bis $31,$31,$0 + blt $18,$43 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + ldq $1,0($16) # 1 1 + .align 3 +$42: + mulq $20,$19,$5 # 1 2 1 ###### + ldq $21,8($17) # 2 1 + ldq $2,8($16) # 2 1 + umulh $20,$19,$20 # 1 2 ###### + ldq $27,16($17) # 3 1 + ldq $3,16($16) # 3 1 + mulq $21,$19,$6 # 2 2 1 ###### + ldq $28,24($17) # 4 1 + addq $1,$5,$1 # 1 2 2 + ldq $4,24($16) # 4 1 + umulh $21,$19,$21 # 2 2 ###### + cmpult $1,$5,$22 # 1 2 3 1 + addq $20,$22,$20 # 1 3 1 + addq $1,$0,$1 # 1 2 3 1 + mulq $27,$19,$7 # 3 2 1 ###### + cmpult $1,$0,$0 # 1 2 3 2 + addq $2,$6,$2 # 2 2 2 + addq $20,$0,$0 # 1 3 2 + cmpult $2,$6,$23 # 2 2 3 1 + addq $21,$23,$21 # 2 3 1 + umulh $27,$19,$27 # 3 2 ###### + addq $2,$0,$2 # 2 2 3 1 + cmpult $2,$0,$0 # 2 2 3 2 + subq $18,4,$18 + mulq $28,$19,$8 # 4 2 1 ###### + addq $21,$0,$0 # 2 3 2 + addq $3,$7,$3 # 3 2 2 + addq $16,32,$16 + cmpult $3,$7,$24 # 3 2 3 1 + stq $1,-32($16) # 1 2 4 + umulh $28,$19,$28 # 4 2 ###### + addq $27,$24,$27 # 3 3 1 + addq $3,$0,$3 # 3 2 3 1 + stq $2,-24($16) # 2 2 4 + cmpult $3,$0,$0 # 3 2 3 2 + stq $3,-16($16) # 3 2 4 + addq $4,$8,$4 # 4 2 2 + addq $27,$0,$0 # 3 3 2 + cmpult $4,$8,$25 # 4 2 3 1 + addq $17,32,$17 + addq $28,$25,$28 # 4 3 1 + addq $4,$0,$4 # 4 2 3 1 + cmpult $4,$0,$0 # 4 2 3 2 + stq $4,-8($16) # 4 2 4 + addq $28,$0,$0 # 4 3 2 + blt $18,$43 - ble $18,$43 + ldq $20,0($17) # 1 1 + ldq $1,0($16) # 1 1 - addq $16,16,$16 - addq $17,16,$17 - blt $25,$42 + br $42 - br $31,$142 -$42: - ldq $1,0($17) # a[0] - umulh $19,$1,$3 # a[0]*w high part - mulq $19,$1,$1 # a[0]*w low part - ldq $2,0($16) # r[0] - addq $1,$2,$1 # low part + r[0] - cmpult $1,$2,$4 # overflow? - addq $4,$3,$3 # high part + overflow - addq $1,$0,$1 # add c - cmpult $1,$0,$4 # overflow? 
- addq $4,$3,$0 # c=high part + overflow - stq $1,0($16) + .align 4 +$45: + ldq $20,0($17) # 4 1 + ldq $1,0($16) # 4 1 + mulq $20,$19,$5 # 4 2 1 + subq $18,1,$18 + addq $16,8,$16 + addq $17,8,$17 + umulh $20,$19,$20 # 4 2 + addq $1,$5,$1 # 4 2 2 + cmpult $1,$5,$22 # 4 2 3 1 + addq $20,$22,$20 # 4 3 1 + addq $1,$0,$1 # 4 2 3 1 + cmpult $1,$0,$0 # 4 2 3 2 + addq $20,$0,$0 # 4 3 2 + stq $1,-8($16) # 4 2 4 + bgt $18,$45 + ret $31,($26),1 # else exit .align 4 $43: - ret $31,($26),1 + addq $18,4,$18 + bgt $18,$45 # goto tail code + ret $31,($26),1 # else exit + .end bn_mul_add_words .align 3 .globl bn_mul_words @@ -81,49 +113,75 @@ bn_mul_words: bn_mul_words..ng: .frame $30,0,$26,0 .prologue 0 - subq $18,2,$25 # num=-2 - bis $31,$31,$0 - blt $25,$242 .align 5 -$342: - subq $18,2,$18 # num-=2 - subq $25,2,$25 # num-=2 - - ldq $1,0($17) # a[0] - ldq $2,8($17) # a[1] - - mulq $19,$1,$3 # a[0]*w low part r3 - umulh $19,$1,$1 # a[0]*w high part r1 - mulq $19,$2,$4 # a[1]*w low part r4 - umulh $19,$2,$2 # a[1]*w high part r2 - - addq $3,$0,$3 # add c - cmpult $3,$0,$5 # overflow? - stq $3,0($16) - addq $5,$1,$0 # c=high part + overflow - - addq $4,$0,$4 # add c - cmpult $4,$0,$5 # overflow? - stq $4,8($16) - addq $5,$2,$0 # c=high part + overflow - - ble $18,$243 - - addq $16,16,$16 - addq $17,16,$17 - blt $25,$242 - - br $31,$342 -$242: - ldq $1,0($17) # a[0] - umulh $19,$1,$3 # a[0]*w high part - mulq $19,$1,$1 # a[0]*w low part - addq $1,$0,$1 # add c - cmpult $1,$0,$4 # overflow? 
- addq $4,$3,$0 # c=high part + overflow - stq $1,0($16) -$243: - ret $31,($26),1 + subq $18,4,$18 + bis $31,$31,$0 + blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + .align 3 +$142: + + mulq $20,$19,$5 # 1 2 1 ##### + ldq $21,8($17) # 2 1 + ldq $27,16($17) # 3 1 + umulh $20,$19,$20 # 1 2 ##### + ldq $28,24($17) # 4 1 + mulq $21,$19,$6 # 2 2 1 ##### + addq $5,$0,$5 # 1 2 3 1 + subq $18,4,$18 + cmpult $5,$0,$0 # 1 2 3 2 + umulh $21,$19,$21 # 2 2 ##### + addq $20,$0,$0 # 1 3 2 + addq $17,32,$17 + addq $6,$0,$6 # 2 2 3 1 + mulq $27,$19,$7 # 3 2 1 ##### + cmpult $6,$0,$0 # 2 2 3 2 + addq $21,$0,$0 # 2 3 2 + addq $16,32,$16 + umulh $27,$19,$27 # 3 2 ##### + stq $5,-32($16) # 1 2 4 + mulq $28,$19,$8 # 4 2 1 ##### + addq $7,$0,$7 # 3 2 3 1 + stq $6,-24($16) # 2 2 4 + cmpult $7,$0,$0 # 3 2 3 2 + umulh $28,$19,$28 # 4 2 ##### + addq $27,$0,$0 # 3 3 2 + stq $7,-16($16) # 3 2 4 + addq $8,$0,$8 # 4 2 3 1 + cmpult $8,$0,$0 # 4 2 3 2 + + addq $28,$0,$0 # 4 3 2 + + stq $8,-8($16) # 4 2 4 + + blt $18,$143 + + ldq $20,0($17) # 1 1 + + br $142 + + .align 4 +$145: + ldq $20,0($17) # 4 1 + mulq $20,$19,$5 # 4 2 1 + subq $18,1,$18 + umulh $20,$19,$20 # 4 2 + addq $5,$0,$5 # 4 2 3 1 + addq $16,8,$16 + cmpult $5,$0,$0 # 4 2 3 2 + addq $17,8,$17 + addq $20,$0,$0 # 4 3 2 + stq $5,-8($16) # 4 2 4 + + bgt $18,$145 + ret $31,($26),1 # else exit + + .align 4 +$143: + addq $18,4,$18 + bgt $18,$145 # goto tail code + ret $31,($26),1 # else exit + .end bn_mul_words .align 3 .globl bn_sqr_words @@ -132,44 +190,58 @@ bn_sqr_words: bn_sqr_words..ng: .frame $30,0,$26,0 .prologue 0 - - subq $18,2,$25 # num=-2 - blt $25,$442 - .align 5 -$542: - subq $18,2,$18 # num-=2 - subq $25,2,$25 # num-=2 - - ldq $1,0($17) # a[0] - ldq $4,8($17) # a[1] - mulq $1,$1,$2 # a[0]*w low part r2 - umulh $1,$1,$3 # a[0]*w high part r3 - mulq $4,$4,$5 # a[1]*w low part r5 - umulh $4,$4,$6 # a[1]*w high part r6 - - stq $2,0($16) # r[0] - stq $3,8($16) # r[1] - stq $5,16($16) # r[3] - stq 
$6,24($16) # r[4] + subq $18,4,$18 + blt $18,$543 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + .align 3 +$542: + mulq $20,$20,$5 ###### + ldq $21,8($17) # 1 1 + subq $18,4 + umulh $20,$20,$1 ###### + ldq $27,16($17) # 1 1 + mulq $21,$21,$6 ###### + ldq $28,24($17) # 1 1 + stq $5,0($16) # r[0] + umulh $21,$21,$2 ###### + stq $1,8($16) # r[1] + mulq $27,$27,$7 ###### + stq $6,16($16) # r[0] + umulh $27,$27,$3 ###### + stq $2,24($16) # r[1] + mulq $28,$28,$8 ###### + stq $7,32($16) # r[0] + umulh $28,$28,$4 ###### + stq $3,40($16) # r[1] - ble $18,$443 + addq $16,64,$16 + addq $17,32,$17 + stq $8,-16($16) # r[0] + stq $4,-8($16) # r[1] - addq $16,32,$16 - addq $17,16,$17 - blt $25,$442 - br $31,$542 + blt $18,$543 + ldq $20,0($17) # 1 1 + br $542 $442: - ldq $1,0($17) # a[0] - mulq $1,$1,$2 # a[0]*w low part r2 - umulh $1,$1,$3 # a[0]*w high part r3 - stq $2,0($16) # r[0] - stq $3,8($16) # r[1] + ldq $20,0($17) # a[0] + mulq $20,$20,$5 # a[0]*w low part r2 + addq $16,16,$16 + addq $17,8,$17 + subq $18,1,$18 + umulh $20,$20,$1 # a[0]*w high part r3 + stq $5,-16($16) # r[0] + stq $1,-8($16) # r[1] + + bgt $18,$442 + ret $31,($26),1 # else exit .align 4 -$443: - ret $31,($26),1 +$543: + addq $18,4,$18 + bgt $18,$442 # goto tail code + ret $31,($26),1 # else exit .end bn_sqr_words .align 3 @@ -180,31 +252,74 @@ bn_add_words..ng: .frame $30,0,$26,0 .prologue 0 - bis $31,$31,$8 # carry = 0 - ble $19,$900 + subq $19,4,$19 + bis $31,$31,$0 # carry = 0 + blt $19,$900 + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + .align 3 $901: - ldq $0,0($17) # a[0] - ldq $1,0($18) # a[1] + addq $1,$5,$1 # r=a+b; + ldq $6,8($17) # a[1] + cmpult $1,$5,$22 # did we overflow? + ldq $2,8($18) # b[1] + addq $1,$0,$1 # c+= overflow + ldq $7,16($17) # a[2] + cmpult $1,$0,$0 # overflow? + ldq $3,16($18) # b[2] + addq $0,$22,$0 + ldq $8,24($17) # a[3] + addq $2,$6,$2 # r=a+b; + ldq $4,24($18) # b[3] + cmpult $2,$6,$23 # did we overflow? 
+ addq $3,$7,$3 # r=a+b; + addq $2,$0,$2 # c+= overflow + cmpult $3,$7,$24 # did we overflow? + cmpult $2,$0,$0 # overflow? + addq $4,$8,$4 # r=a+b; + addq $0,$23,$0 + cmpult $4,$8,$25 # did we overflow? + addq $3,$0,$3 # c+= overflow + stq $1,0($16) # r[0]=c + cmpult $3,$0,$0 # overflow? + stq $2,8($16) # r[1]=c + addq $0,$24,$0 + stq $3,16($16) # r[2]=c + addq $4,$0,$4 # c+= overflow + subq $19,4,$19 # loop-- + cmpult $4,$0,$0 # overflow? + addq $17,32,$17 # a++ + addq $0,$25,$0 + stq $4,24($16) # r[3]=c + addq $18,32,$18 # b++ + addq $16,32,$16 # r++ - addq $0,$1,$3 # c=a+b; + blt $19,$900 + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + br $901 + .align 4 +$945: + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + addq $1,$5,$1 # r=a+b; + subq $19,1,$19 # loop-- + addq $1,$0,$1 # c+= overflow addq $17,8,$17 # a++ + cmpult $1,$5,$22 # did we overflow? + cmpult $1,$0,$0 # overflow? + addq $18,8,$18 # b++ + stq $1,0($16) # r[0]=c + addq $0,$22,$0 + addq $16,8,$16 # r++ - cmpult $3,$1,$7 # did we overflow? - addq $18,8,$18 # b++ - - addq $8,$3,$3 # c+=carry + bgt $19,$945 + ret $31,($26),1 # else exit - cmpult $3,$8,$8 # did we overflow? 
- stq $3,($16) # r[0]=c - - addq $7,$8,$8 # add into overflow - subq $19,1,$19 # loop-- - - addq $16,8,$16 # r++ - bgt $19,$901 $900: - bis $8,$8,$0 # return carry - ret $31,($26),1 + addq $19,4,$19 + bgt $19,$945 # goto tail code + ret $31,($26),1 # else exit .end bn_add_words # @@ -339,6 +454,1445 @@ $136: addq $30,48,$30 ret $31,($26),1 .end bn_div64 - .ident "GCC: (GNU) 2.7.2.1" + .set noat + .text + .align 3 + .globl bn_sub_words + .ent bn_sub_words +bn_sub_words: +bn_sub_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19, 4, $19 + bis $31, $31, $0 + blt $19, $100 + ldq $1, 0($17) + ldq $2, 0($18) +$101: + ldq $3, 8($17) + cmpult $1, $2, $4 + ldq $5, 8($18) + subq $1, $2, $1 + ldq $6, 16($17) + cmpult $1, $0, $2 + ldq $7, 16($18) + subq $1, $0, $23 + ldq $8, 24($17) + addq $2, $4, $0 + cmpult $3, $5, $24 + subq $3, $5, $3 + ldq $22, 24($18) + cmpult $3, $0, $5 + subq $3, $0, $25 + addq $5, $24, $0 + cmpult $6, $7, $27 + subq $6, $7, $6 + stq $23, 0($16) + cmpult $6, $0, $7 + subq $6, $0, $28 + addq $7, $27, $0 + cmpult $8, $22, $21 + subq $8, $22, $8 + stq $25, 8($16) + cmpult $8, $0, $22 + subq $8, $0, $20 + addq $22, $21, $0 + stq $28, 16($16) + subq $19, 4, $19 + stq $20, 24($16) + addq $17, 32, $17 + addq $18, 32, $18 + addq $16, 32, $16 + blt $19, $100 + ldq $1, 0($17) + ldq $2, 0($18) + br $101 +$102: + ldq $1, 0($17) + ldq $2, 0($18) + cmpult $1, $2, $27 + subq $1, $2, $1 + cmpult $1, $0, $2 + subq $1, $0, $1 + stq $1, 0($16) + addq $2, $27, $0 + addq $17, 8, $17 + addq $18, 8, $18 + addq $16, 8, $16 + subq $19, 1, $19 + bgt $19, $102 + ret $31,($26),1 +$100: + addq $19, 4, $19 + bgt $19, $102 +$103: + ret $31,($26),1 + .end bn_sub_words + .text + .align 3 + .globl bn_mul_comba4 + .ent bn_mul_comba4 +bn_mul_comba4: +bn_mul_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 0($18) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + bis $31, $31, $23 + mulq $0, $1, $8 
+ umulh $0, $1, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $0, $3, $24 + umulh $0, $3, $25 + addq $22, $24, $22 + cmpult $22, $24, $27 + addq $27, $25, $25 + addq $23, $25, $23 + cmpult $23, $25, $28 + addq $8, $28, $8 + mulq $2, $1, $21 + umulh $2, $1, $20 + addq $22, $21, $22 + cmpult $22, $21, $19 + addq $19, $20, $20 + addq $23, $20, $23 + cmpult $23, $20, $17 + addq $8, $17, $8 + stq $22, 8($16) + bis $31, $31, $22 + mulq $2, $3, $18 + umulh $2, $3, $24 + addq $23, $18, $23 + cmpult $23, $18, $27 + addq $27, $24, $24 + addq $8, $24, $8 + cmpult $8, $24, $25 + addq $22, $25, $22 + mulq $0, $5, $28 + umulh $0, $5, $21 + addq $23, $28, $23 + cmpult $23, $28, $19 + addq $19, $21, $21 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $22, $20, $22 + mulq $4, $1, $17 + umulh $4, $1, $18 + addq $23, $17, $23 + cmpult $23, $17, $27 + addq $27, $18, $18 + addq $8, $18, $8 + cmpult $8, $18, $24 + addq $22, $24, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $0, $7, $25 + umulh $0, $7, $28 + addq $8, $25, $8 + cmpult $8, $25, $19 + addq $19, $28, $28 + addq $22, $28, $22 + cmpult $22, $28, $21 + addq $23, $21, $23 + mulq $2, $5, $20 + umulh $2, $5, $17 + addq $8, $20, $8 + cmpult $8, $20, $27 + addq $27, $17, $17 + addq $22, $17, $22 + cmpult $22, $17, $18 + addq $23, $18, $23 + mulq $4, $3, $24 + umulh $4, $3, $25 + addq $8, $24, $8 + cmpult $8, $24, $19 + addq $19, $25, $25 + addq $22, $25, $22 + cmpult $22, $25, $28 + addq $23, $28, $23 + mulq $6, $1, $21 + umulh $6, $1, $0 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $20, $0, $0 + addq $22, $0, $22 + cmpult $22, $0, $27 + addq $23, $27, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $7, $17 + umulh $2, $7, $18 + addq $22, $17, $22 + cmpult $22, $17, $24 + addq $24, $18, $18 + addq $23, $18, $23 + cmpult $23, $18, $19 + addq $8, $19, $8 + mulq $4, $5, $25 + umulh $4, $5, $28 + addq $22, $25, $22 + cmpult $22, $25, $21 + addq $21, $28, $28 + addq $23, $28, $23 + cmpult $23, $28, $20 + addq $8, $20, $8 + 
mulq $6, $3, $0 + umulh $6, $3, $27 + addq $22, $0, $22 + cmpult $22, $0, $1 + addq $1, $27, $27 + addq $23, $27, $23 + cmpult $23, $27, $17 + addq $8, $17, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $4, $7, $24 + umulh $4, $7, $18 + addq $23, $24, $23 + cmpult $23, $24, $19 + addq $19, $18, $18 + addq $8, $18, $8 + cmpult $8, $18, $2 + addq $22, $2, $22 + mulq $6, $5, $25 + umulh $6, $5, $21 + addq $23, $25, $23 + cmpult $23, $25, $28 + addq $28, $21, $21 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $22, $20, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $6, $7, $0 + umulh $6, $7, $1 + addq $8, $0, $8 + cmpult $8, $0, $27 + addq $27, $1, $1 + addq $22, $1, $22 + cmpult $22, $1, $17 + addq $23, $17, $23 + stq $8, 48($16) + stq $22, 56($16) + ret $31,($26),1 + .end bn_mul_comba4 + .text + .align 3 + .globl bn_mul_comba8 + .ent bn_mul_comba8 +bn_mul_comba8: +bn_mul_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $30, 16, $30 + ldq $0, 0($17) + ldq $1, 0($18) + stq $9, 0($30) + stq $10, 8($30) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + ldq $8, 8($17) + ldq $22, 8($18) + ldq $23, 8($17) + ldq $24, 8($18) + ldq $25, 8($17) + ldq $27, 8($18) + ldq $28, 8($17) + ldq $21, 8($18) + bis $31, $31, $9 + mulq $0, $1, $20 + umulh $0, $1, $19 + stq $20, 0($16) + bis $31, $31, $20 + mulq $0, $3, $10 + umulh $0, $3, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $2, $1, $18 + umulh $2, $1, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + stq $19, 8($16) + bis $31, $31, $19 + mulq $0, $5, $10 + umulh $0, $5, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $2, $3, $18 + umulh $2, $3, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, 
$17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $4, $1, $10 + umulh $4, $1, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + stq $9, 16($16) + bis $31, $31, $9 + mulq $0, $7, $18 + umulh $0, $7, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $2, $5, $10 + umulh $2, $5, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $4, $3, $18 + umulh $4, $3, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $6, $1, $10 + umulh $6, $1, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + stq $20, 24($16) + bis $31, $31, $20 + mulq $0, $22, $18 + umulh $0, $22, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $2, $7, $10 + umulh $2, $7, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $4, $5, $18 + umulh $4, $5, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $6, $3, $10 + umulh $6, $3, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $8, $1, $18 + umulh $8, $1, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + stq $19, 32($16) + bis $31, $31, $19 + mulq $0, $24, $10 + umulh $0, $24, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, 
$10 + addq $19, $10, $19 + mulq $2, $22, $18 + umulh $2, $22, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $4, $7, $10 + umulh $4, $7, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $6, $5, $18 + umulh $6, $5, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $8, $3, $10 + umulh $8, $3, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $23, $1, $18 + umulh $23, $1, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + stq $9, 40($16) + bis $31, $31, $9 + mulq $0, $27, $10 + umulh $0, $27, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $2, $24, $18 + umulh $2, $24, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $4, $22, $10 + umulh $4, $22, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $6, $7, $18 + umulh $6, $7, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $8, $5, $10 + umulh $8, $5, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $23, $3, $18 + umulh $23, $3, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $25, $1, $10 + umulh $25, $1, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq 
$18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + stq $20, 48($16) + bis $31, $31, $20 + mulq $0, $21, $18 + umulh $0, $21, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $2, $27, $10 + umulh $2, $27, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $4, $24, $10 + umulh $4, $24, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $6, $22, $10 + umulh $6, $22, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $8, $7, $10 + umulh $8, $7, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $23, $5, $10 + umulh $23, $5, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $25, $3, $10 + umulh $25, $3, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $28, $1, $10 + umulh $28, $1, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + stq $19, 56($16) + bis $31, $31, $19 + mulq $2, $21, $10 + umulh $2, $21, $18 + addq $9, $10, $9 + cmpult $9, $10, $17 + addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $0 + addq $19, $0, $19 + mulq $4, $27, $1 + umulh $4, $27, $10 + addq $9, $1, $9 + cmpult $9, $1, $17 + addq $17, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $19, $18, $19 + mulq $6, $24, $0 + umulh $6, $24, $2 + addq $9, $0, $9 + cmpult $9, $0, $1 + addq $1, $2, $2 + addq $20, $2, $20 + cmpult $20, $2, $17 + addq $19, $17, $19 + mulq $8, $22, $10 + umulh 
$8, $22, $18 + addq $9, $10, $9 + cmpult $9, $10, $0 + addq $0, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $1 + addq $19, $1, $19 + mulq $23, $7, $2 + umulh $23, $7, $17 + addq $9, $2, $9 + cmpult $9, $2, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $0 + addq $19, $0, $19 + mulq $25, $5, $18 + umulh $25, $5, $1 + addq $9, $18, $9 + cmpult $9, $18, $2 + addq $2, $1, $1 + addq $20, $1, $20 + cmpult $20, $1, $10 + addq $19, $10, $19 + mulq $28, $3, $17 + umulh $28, $3, $0 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $18, $0, $0 + addq $20, $0, $20 + cmpult $20, $0, $2 + addq $19, $2, $19 + stq $9, 64($16) + bis $31, $31, $9 + mulq $4, $21, $1 + umulh $4, $21, $10 + addq $20, $1, $20 + cmpult $20, $1, $17 + addq $17, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $9, $18, $9 + mulq $6, $27, $0 + umulh $6, $27, $2 + addq $20, $0, $20 + cmpult $20, $0, $3 + addq $3, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $1 + addq $9, $1, $9 + mulq $8, $24, $17 + umulh $8, $24, $10 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $18, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $4 + addq $9, $4, $9 + mulq $23, $22, $0 + umulh $23, $22, $3 + addq $20, $0, $20 + cmpult $20, $0, $2 + addq $2, $3, $3 + addq $19, $3, $19 + cmpult $19, $3, $1 + addq $9, $1, $9 + mulq $25, $7, $17 + umulh $25, $7, $18 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $10, $18, $18 + addq $19, $18, $19 + cmpult $19, $18, $4 + addq $9, $4, $9 + mulq $28, $5, $0 + umulh $28, $5, $2 + addq $20, $0, $20 + cmpult $20, $0, $3 + addq $3, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $1 + addq $9, $1, $9 + stq $20, 72($16) + bis $31, $31, $20 + mulq $6, $21, $17 + umulh $6, $21, $10 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $18, $10, $10 + addq $9, $10, $9 + cmpult $9, $10, $4 + addq $20, $4, $20 + mulq $8, $27, $0 + umulh $8, $27, $3 + addq $19, $0, $19 + cmpult $19, $0, $2 + addq $2, $3, $3 + addq $9, $3, $9 + cmpult $9, $3, $1 + addq $20, $1, $20 + mulq 
$23, $24, $5 + umulh $23, $24, $17 + addq $19, $5, $19 + cmpult $19, $5, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $25, $22, $4 + umulh $25, $22, $6 + addq $19, $4, $19 + cmpult $19, $4, $0 + addq $0, $6, $6 + addq $9, $6, $9 + cmpult $9, $6, $2 + addq $20, $2, $20 + mulq $28, $7, $3 + umulh $28, $7, $1 + addq $19, $3, $19 + cmpult $19, $3, $5 + addq $5, $1, $1 + addq $9, $1, $9 + cmpult $9, $1, $18 + addq $20, $18, $20 + stq $19, 80($16) + bis $31, $31, $19 + mulq $8, $21, $17 + umulh $8, $21, $10 + addq $9, $17, $9 + cmpult $9, $17, $4 + addq $4, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $0 + addq $19, $0, $19 + mulq $23, $27, $6 + umulh $23, $27, $2 + addq $9, $6, $9 + cmpult $9, $6, $3 + addq $3, $2, $2 + addq $20, $2, $20 + cmpult $20, $2, $5 + addq $19, $5, $19 + mulq $25, $24, $1 + umulh $25, $24, $18 + addq $9, $1, $9 + cmpult $9, $1, $7 + addq $7, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $19, $17, $19 + mulq $28, $22, $4 + umulh $28, $22, $10 + addq $9, $4, $9 + cmpult $9, $4, $0 + addq $0, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $8 + addq $19, $8, $19 + stq $9, 88($16) + bis $31, $31, $9 + mulq $23, $21, $6 + umulh $23, $21, $3 + addq $20, $6, $20 + cmpult $20, $6, $2 + addq $2, $3, $3 + addq $19, $3, $19 + cmpult $19, $3, $5 + addq $9, $5, $9 + mulq $25, $27, $1 + umulh $25, $27, $7 + addq $20, $1, $20 + cmpult $20, $1, $18 + addq $18, $7, $7 + addq $19, $7, $19 + cmpult $19, $7, $17 + addq $9, $17, $9 + mulq $28, $24, $4 + umulh $28, $24, $0 + addq $20, $4, $20 + cmpult $20, $4, $10 + addq $10, $0, $0 + addq $19, $0, $19 + cmpult $19, $0, $8 + addq $9, $8, $9 + stq $20, 96($16) + bis $31, $31, $20 + mulq $25, $21, $22 + umulh $25, $21, $6 + addq $19, $22, $19 + cmpult $19, $22, $2 + addq $2, $6, $6 + addq $9, $6, $9 + cmpult $9, $6, $3 + addq $20, $3, $20 + mulq $28, $27, $5 + umulh $28, $27, $23 + addq $19, $5, $19 + cmpult $19, $5, $1 + addq $1, $23, $23 + addq $9, 
$23, $9 + cmpult $9, $23, $18 + addq $20, $18, $20 + stq $19, 104($16) + bis $31, $31, $19 + mulq $28, $21, $7 + umulh $28, $21, $17 + addq $9, $7, $9 + cmpult $9, $7, $4 + addq $4, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + stq $9, 112($16) + stq $20, 120($16) + ldq $9, 0($30) + ldq $10, 8($30) + addq $30, 16, $30 + ret $31,($26),1 + .end bn_mul_comba8 + .text + .align 3 + .globl bn_sqr_comba4 + .ent bn_sqr_comba4 +bn_sqr_comba4: +bn_sqr_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + bis $31, $31, $6 + mulq $0, $0, $4 + umulh $0, $0, $5 + stq $4, 0($16) + bis $31, $31, $4 + mulq $0, $1, $7 + umulh $0, $1, $8 + cmplt $7, $31, $22 + cmplt $8, $31, $23 + addq $7, $7, $7 + addq $8, $8, $8 + addq $8, $22, $8 + addq $4, $23, $4 + addq $5, $7, $5 + addq $6, $8, $6 + cmpult $5, $7, $24 + cmpult $6, $8, $25 + addq $6, $24, $6 + addq $4, $25, $4 + stq $5, 8($16) + bis $31, $31, $5 + mulq $1, $1, $27 + umulh $1, $1, $28 + addq $6, $27, $6 + addq $4, $28, $4 + cmpult $6, $27, $21 + cmpult $4, $28, $20 + addq $4, $21, $4 + addq $5, $20, $5 + mulq $2, $0, $19 + umulh $2, $0, $18 + cmplt $19, $31, $17 + cmplt $18, $31, $22 + addq $19, $19, $19 + addq $18, $18, $18 + addq $18, $17, $18 + addq $5, $22, $5 + addq $6, $19, $6 + addq $4, $18, $4 + cmpult $6, $19, $23 + cmpult $4, $18, $7 + addq $4, $23, $4 + addq $5, $7, $5 + stq $6, 16($16) + bis $31, $31, $6 + mulq $3, $0, $8 + umulh $3, $0, $24 + cmplt $8, $31, $25 + cmplt $24, $31, $27 + addq $8, $8, $8 + addq $24, $24, $24 + addq $24, $25, $24 + addq $6, $27, $6 + addq $4, $8, $4 + addq $5, $24, $5 + cmpult $4, $8, $28 + cmpult $5, $24, $21 + addq $5, $28, $5 + addq $6, $21, $6 + mulq $2, $1, $20 + umulh $2, $1, $17 + cmplt $20, $31, $22 + cmplt $17, $31, $19 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $22, $17 + addq $6, $19, $6 + addq $4, $20, $4 + addq $5, $17, $5 + cmpult $4, $20, $18 + cmpult $5, $17, $23 + 
addq $5, $18, $5 + addq $6, $23, $6 + stq $4, 24($16) + bis $31, $31, $4 + mulq $2, $2, $7 + umulh $2, $2, $25 + addq $5, $7, $5 + addq $6, $25, $6 + cmpult $5, $7, $27 + cmpult $6, $25, $8 + addq $6, $27, $6 + addq $4, $8, $4 + mulq $3, $1, $24 + umulh $3, $1, $28 + cmplt $24, $31, $21 + cmplt $28, $31, $22 + addq $24, $24, $24 + addq $28, $28, $28 + addq $28, $21, $28 + addq $4, $22, $4 + addq $5, $24, $5 + addq $6, $28, $6 + cmpult $5, $24, $19 + cmpult $6, $28, $20 + addq $6, $19, $6 + addq $4, $20, $4 + stq $5, 32($16) + bis $31, $31, $5 + mulq $3, $2, $17 + umulh $3, $2, $18 + cmplt $17, $31, $23 + cmplt $18, $31, $7 + addq $17, $17, $17 + addq $18, $18, $18 + addq $18, $23, $18 + addq $5, $7, $5 + addq $6, $17, $6 + addq $4, $18, $4 + cmpult $6, $17, $25 + cmpult $4, $18, $27 + addq $4, $25, $4 + addq $5, $27, $5 + stq $6, 40($16) + bis $31, $31, $6 + mulq $3, $3, $8 + umulh $3, $3, $21 + addq $4, $8, $4 + addq $5, $21, $5 + cmpult $4, $8, $22 + cmpult $5, $21, $24 + addq $5, $22, $5 + addq $6, $24, $6 + stq $4, 48($16) + stq $5, 56($16) + ret $31,($26),1 + .end bn_sqr_comba4 + .text + .align 3 + .globl bn_sqr_comba8 + .ent bn_sqr_comba8 +bn_sqr_comba8: +bn_sqr_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + ldq $4, 32($17) + ldq $5, 40($17) + ldq $6, 48($17) + ldq $7, 56($17) + bis $31, $31, $23 + mulq $0, $0, $8 + umulh $0, $0, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $1, $0, $24 + umulh $1, $0, $25 + cmplt $24, $31, $27 + cmplt $25, $31, $28 + addq $24, $24, $24 + addq $25, $25, $25 + addq $25, $27, $25 + addq $8, $28, $8 + addq $22, $24, $22 + addq $23, $25, $23 + cmpult $22, $24, $21 + cmpult $23, $25, $20 + addq $23, $21, $23 + addq $8, $20, $8 + stq $22, 8($16) + bis $31, $31, $22 + mulq $1, $1, $19 + umulh $1, $1, $18 + addq $23, $19, $23 + addq $8, $18, $8 + cmpult $23, $19, $17 + cmpult $8, $18, $27 + addq $8, $17, $8 + addq $22, $27, $22 + mulq $2, $0, $28 + umulh $2, $0, 
$24 + cmplt $28, $31, $25 + cmplt $24, $31, $21 + addq $28, $28, $28 + addq $24, $24, $24 + addq $24, $25, $24 + addq $22, $21, $22 + addq $23, $28, $23 + addq $8, $24, $8 + cmpult $23, $28, $20 + cmpult $8, $24, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $2, $1, $18 + umulh $2, $1, $17 + cmplt $18, $31, $27 + cmplt $17, $31, $25 + addq $18, $18, $18 + addq $17, $17, $17 + addq $17, $27, $17 + addq $23, $25, $23 + addq $8, $18, $8 + addq $22, $17, $22 + cmpult $8, $18, $21 + cmpult $22, $17, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $3, $0, $24 + umulh $3, $0, $20 + cmplt $24, $31, $19 + cmplt $20, $31, $27 + addq $24, $24, $24 + addq $20, $20, $20 + addq $20, $19, $20 + addq $23, $27, $23 + addq $8, $24, $8 + addq $22, $20, $22 + cmpult $8, $24, $25 + cmpult $22, $20, $18 + addq $22, $25, $22 + addq $23, $18, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $2, $17 + umulh $2, $2, $21 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $28 + cmpult $23, $21, $19 + addq $23, $28, $23 + addq $8, $19, $8 + mulq $3, $1, $27 + umulh $3, $1, $24 + cmplt $27, $31, $20 + cmplt $24, $31, $25 + addq $27, $27, $27 + addq $24, $24, $24 + addq $24, $20, $24 + addq $8, $25, $8 + addq $22, $27, $22 + addq $23, $24, $23 + cmpult $22, $27, $18 + cmpult $23, $24, $17 + addq $23, $18, $23 + addq $8, $17, $8 + mulq $4, $0, $21 + umulh $4, $0, $28 + cmplt $21, $31, $19 + cmplt $28, $31, $20 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $19, $28 + addq $8, $20, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $25 + cmpult $23, $28, $27 + addq $23, $25, $23 + addq $8, $27, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $3, $2, $24 + umulh $3, $2, $18 + cmplt $24, $31, $17 + cmplt $18, $31, $19 + addq $24, $24, $24 + addq $18, $18, $18 + addq $18, $17, $18 + addq $22, $19, $22 + addq $23, $24, $23 + addq $8, $18, $8 + cmpult $23, $24, $20 + cmpult $8, $18, $21 + addq $8, $20, $8 + addq $22, 
$21, $22 + mulq $4, $1, $28 + umulh $4, $1, $25 + cmplt $28, $31, $27 + cmplt $25, $31, $17 + addq $28, $28, $28 + addq $25, $25, $25 + addq $25, $27, $25 + addq $22, $17, $22 + addq $23, $28, $23 + addq $8, $25, $8 + cmpult $23, $28, $19 + cmpult $8, $25, $24 + addq $8, $19, $8 + addq $22, $24, $22 + mulq $5, $0, $18 + umulh $5, $0, $20 + cmplt $18, $31, $21 + cmplt $20, $31, $27 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $21, $20 + addq $22, $27, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $28 + addq $8, $17, $8 + addq $22, $28, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $3, $3, $25 + umulh $3, $3, $19 + addq $8, $25, $8 + addq $22, $19, $22 + cmpult $8, $25, $24 + cmpult $22, $19, $21 + addq $22, $24, $22 + addq $23, $21, $23 + mulq $4, $2, $27 + umulh $4, $2, $18 + cmplt $27, $31, $20 + cmplt $18, $31, $17 + addq $27, $27, $27 + addq $18, $18, $18 + addq $18, $20, $18 + addq $23, $17, $23 + addq $8, $27, $8 + addq $22, $18, $22 + cmpult $8, $27, $28 + cmpult $22, $18, $25 + addq $22, $28, $22 + addq $23, $25, $23 + mulq $5, $1, $19 + umulh $5, $1, $24 + cmplt $19, $31, $21 + cmplt $24, $31, $20 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $21, $24 + addq $23, $20, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $17 + cmpult $22, $24, $27 + addq $22, $17, $22 + addq $23, $27, $23 + mulq $6, $0, $18 + umulh $6, $0, $28 + cmplt $18, $31, $25 + cmplt $28, $31, $21 + addq $18, $18, $18 + addq $28, $28, $28 + addq $28, $25, $28 + addq $23, $21, $23 + addq $8, $18, $8 + addq $22, $28, $22 + cmpult $8, $18, $20 + cmpult $22, $28, $19 + addq $22, $20, $22 + addq $23, $19, $23 + stq $8, 48($16) + bis $31, $31, $8 + mulq $4, $3, $24 + umulh $4, $3, $17 + cmplt $24, $31, $27 + cmplt $17, $31, $25 + addq $24, $24, $24 + addq $17, $17, $17 + addq $17, $27, $17 + addq $8, $25, $8 + addq $22, $24, $22 + addq $23, $17, $23 + cmpult $22, $24, $21 + cmpult $23, $17, $18 + addq $23, $21, $23 + addq 
$8, $18, $8 + mulq $5, $2, $28 + umulh $5, $2, $20 + cmplt $28, $31, $19 + cmplt $20, $31, $27 + addq $28, $28, $28 + addq $20, $20, $20 + addq $20, $19, $20 + addq $8, $27, $8 + addq $22, $28, $22 + addq $23, $20, $23 + cmpult $22, $28, $25 + cmpult $23, $20, $24 + addq $23, $25, $23 + addq $8, $24, $8 + mulq $6, $1, $17 + umulh $6, $1, $21 + cmplt $17, $31, $18 + cmplt $21, $31, $19 + addq $17, $17, $17 + addq $21, $21, $21 + addq $21, $18, $21 + addq $8, $19, $8 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $27 + cmpult $23, $21, $28 + addq $23, $27, $23 + addq $8, $28, $8 + mulq $7, $0, $20 + umulh $7, $0, $25 + cmplt $20, $31, $24 + cmplt $25, $31, $18 + addq $20, $20, $20 + addq $25, $25, $25 + addq $25, $24, $25 + addq $8, $18, $8 + addq $22, $20, $22 + addq $23, $25, $23 + cmpult $22, $20, $19 + cmpult $23, $25, $17 + addq $23, $19, $23 + addq $8, $17, $8 + stq $22, 56($16) + bis $31, $31, $22 + mulq $4, $4, $21 + umulh $4, $4, $27 + addq $23, $21, $23 + addq $8, $27, $8 + cmpult $23, $21, $28 + cmpult $8, $27, $24 + addq $8, $28, $8 + addq $22, $24, $22 + mulq $5, $3, $18 + umulh $5, $3, $20 + cmplt $18, $31, $25 + cmplt $20, $31, $19 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $25, $20 + addq $22, $19, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $21 + addq $8, $17, $8 + addq $22, $21, $22 + mulq $6, $2, $27 + umulh $6, $2, $28 + cmplt $27, $31, $24 + cmplt $28, $31, $25 + addq $27, $27, $27 + addq $28, $28, $28 + addq $28, $24, $28 + addq $22, $25, $22 + addq $23, $27, $23 + addq $8, $28, $8 + cmpult $23, $27, $19 + cmpult $8, $28, $18 + addq $8, $19, $8 + addq $22, $18, $22 + mulq $7, $1, $20 + umulh $7, $1, $17 + cmplt $20, $31, $21 + cmplt $17, $31, $24 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $21, $17 + addq $22, $24, $22 + addq $23, $20, $23 + addq $8, $17, $8 + cmpult $23, $20, $25 + cmpult $8, $17, $27 + addq $8, $25, $8 + addq $22, $27, $22 + stq $23, 64($16) + bis $31, 
$31, $23 + mulq $5, $4, $28 + umulh $5, $4, $19 + cmplt $28, $31, $18 + cmplt $19, $31, $21 + addq $28, $28, $28 + addq $19, $19, $19 + addq $19, $18, $19 + addq $23, $21, $23 + addq $8, $28, $8 + addq $22, $19, $22 + cmpult $8, $28, $24 + cmpult $22, $19, $20 + addq $22, $24, $22 + addq $23, $20, $23 + mulq $6, $3, $17 + umulh $6, $3, $25 + cmplt $17, $31, $27 + cmplt $25, $31, $18 + addq $17, $17, $17 + addq $25, $25, $25 + addq $25, $27, $25 + addq $23, $18, $23 + addq $8, $17, $8 + addq $22, $25, $22 + cmpult $8, $17, $21 + cmpult $22, $25, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $7, $2, $19 + umulh $7, $2, $24 + cmplt $19, $31, $20 + cmplt $24, $31, $27 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $20, $24 + addq $23, $27, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $18 + cmpult $22, $24, $17 + addq $22, $18, $22 + addq $23, $17, $23 + stq $8, 72($16) + bis $31, $31, $8 + mulq $5, $5, $25 + umulh $5, $5, $21 + addq $22, $25, $22 + addq $23, $21, $23 + cmpult $22, $25, $28 + cmpult $23, $21, $20 + addq $23, $28, $23 + addq $8, $20, $8 + mulq $6, $4, $27 + umulh $6, $4, $19 + cmplt $27, $31, $24 + cmplt $19, $31, $18 + addq $27, $27, $27 + addq $19, $19, $19 + addq $19, $24, $19 + addq $8, $18, $8 + addq $22, $27, $22 + addq $23, $19, $23 + cmpult $22, $27, $17 + cmpult $23, $19, $25 + addq $23, $17, $23 + addq $8, $25, $8 + mulq $7, $3, $21 + umulh $7, $3, $28 + cmplt $21, $31, $20 + cmplt $28, $31, $24 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $20, $28 + addq $8, $24, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $18 + cmpult $23, $28, $27 + addq $23, $18, $23 + addq $8, $27, $8 + stq $22, 80($16) + bis $31, $31, $22 + mulq $6, $5, $19 + umulh $6, $5, $17 + cmplt $19, $31, $25 + cmplt $17, $31, $20 + addq $19, $19, $19 + addq $17, $17, $17 + addq $17, $25, $17 + addq $22, $20, $22 + addq $23, $19, $23 + addq $8, $17, $8 + cmpult $23, $19, $24 + cmpult $8, $17, $21 + addq $8, $24, $8 + addq 
$22, $21, $22 + mulq $7, $4, $28 + umulh $7, $4, $18 + cmplt $28, $31, $27 + cmplt $18, $31, $25 + addq $28, $28, $28 + addq $18, $18, $18 + addq $18, $27, $18 + addq $22, $25, $22 + addq $23, $28, $23 + addq $8, $18, $8 + cmpult $23, $28, $20 + cmpult $8, $18, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 88($16) + bis $31, $31, $23 + mulq $6, $6, $17 + umulh $6, $6, $24 + addq $8, $17, $8 + addq $22, $24, $22 + cmpult $8, $17, $21 + cmpult $22, $24, $27 + addq $22, $21, $22 + addq $23, $27, $23 + mulq $7, $5, $25 + umulh $7, $5, $28 + cmplt $25, $31, $18 + cmplt $28, $31, $20 + addq $25, $25, $25 + addq $28, $28, $28 + addq $28, $18, $28 + addq $23, $20, $23 + addq $8, $25, $8 + addq $22, $28, $22 + cmpult $8, $25, $19 + cmpult $22, $28, $17 + addq $22, $19, $22 + addq $23, $17, $23 + stq $8, 96($16) + bis $31, $31, $8 + mulq $7, $6, $24 + umulh $7, $6, $21 + cmplt $24, $31, $27 + cmplt $21, $31, $18 + addq $24, $24, $24 + addq $21, $21, $21 + addq $21, $27, $21 + addq $8, $18, $8 + addq $22, $24, $22 + addq $23, $21, $23 + cmpult $22, $24, $20 + cmpult $23, $21, $25 + addq $23, $20, $23 + addq $8, $25, $8 + stq $22, 104($16) + bis $31, $31, $22 + mulq $7, $7, $28 + umulh $7, $7, $19 + addq $23, $28, $23 + addq $8, $19, $8 + cmpult $23, $28, $17 + cmpult $8, $19, $27 + addq $8, $17, $8 + addq $22, $27, $22 + stq $23, 112($16) + stq $8, 120($16) + ret $31,($26),1 + .end bn_sqr_comba8 diff --git a/crypto/bn/asm/alpha.s.works b/crypto/bn/asm/alpha.s.works new file mode 100644 index 0000000000..ee6c587809 --- /dev/null +++ b/crypto/bn/asm/alpha.s.works @@ -0,0 +1,533 @@ + + # DEC Alpha assember + # The bn_div64 is actually gcc output but the other parts are hand done. + # Thanks to tzeruch@ceddec.com for sending me the gcc output for + # bn_div64. + # I've gone back and re-done most of routines. + # The key thing to remeber for the 164 CPU is that while a + # multiply operation takes 8 cycles, another one can only be issued + # after 4 cycles have elapsed. 
I've done modification to help + # improve this. Also, normally, a ld instruction will not be available + # for about 3 cycles. + .file 1 "bn_asm.c" + .set noat +gcc2_compiled.: +__gnu_compiled_c: + .text + .align 3 + .globl bn_mul_add_words + .ent bn_mul_add_words +bn_mul_add_words: +bn_mul_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + .align 5 + subq $18,4,$18 + bis $31,$31,$0 + blt $18,$43 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + ldq $1,0($16) # 1 1 + .align 3 +$42: + mulq $20,$19,$5 # 1 2 1 ###### + ldq $21,8($17) # 2 1 + ldq $2,8($16) # 2 1 + umulh $20,$19,$20 # 1 2 ###### + ldq $27,16($17) # 3 1 + ldq $3,16($16) # 3 1 + mulq $21,$19,$6 # 2 2 1 ###### + ldq $28,24($17) # 4 1 + addq $1,$5,$1 # 1 2 2 + ldq $4,24($16) # 4 1 + umulh $21,$19,$21 # 2 2 ###### + cmpult $1,$5,$22 # 1 2 3 1 + addq $20,$22,$20 # 1 3 1 + addq $1,$0,$1 # 1 2 3 1 + mulq $27,$19,$7 # 3 2 1 ###### + cmpult $1,$0,$0 # 1 2 3 2 + addq $2,$6,$2 # 2 2 2 + addq $20,$0,$0 # 1 3 2 + cmpult $2,$6,$23 # 2 2 3 1 + addq $21,$23,$21 # 2 3 1 + umulh $27,$19,$27 # 3 2 ###### + addq $2,$0,$2 # 2 2 3 1 + cmpult $2,$0,$0 # 2 2 3 2 + subq $18,4,$18 + mulq $28,$19,$8 # 4 2 1 ###### + addq $21,$0,$0 # 2 3 2 + addq $3,$7,$3 # 3 2 2 + addq $16,32,$16 + cmpult $3,$7,$24 # 3 2 3 1 + stq $1,-32($16) # 1 2 4 + umulh $28,$19,$28 # 4 2 ###### + addq $27,$24,$27 # 3 3 1 + addq $3,$0,$3 # 3 2 3 1 + stq $2,-24($16) # 2 2 4 + cmpult $3,$0,$0 # 3 2 3 2 + stq $3,-16($16) # 3 2 4 + addq $4,$8,$4 # 4 2 2 + addq $27,$0,$0 # 3 3 2 + cmpult $4,$8,$25 # 4 2 3 1 + addq $17,32,$17 + addq $28,$25,$28 # 4 3 1 + addq $4,$0,$4 # 4 2 3 1 + cmpult $4,$0,$0 # 4 2 3 2 + stq $4,-8($16) # 4 2 4 + addq $28,$0,$0 # 4 3 2 + blt $18,$43 + + ldq $20,0($17) # 1 1 + ldq $1,0($16) # 1 1 + + br $42 + + .align 4 +$45: + ldq $20,0($17) # 4 1 + ldq $1,0($16) # 4 1 + mulq $20,$19,$5 # 4 2 1 + subq $18,1,$18 + addq $16,8,$16 + addq $17,8,$17 + umulh $20,$19,$20 # 4 2 + addq $1,$5,$1 # 4 2 2 + cmpult $1,$5,$22 # 4 2 3 1 + 
addq $20,$22,$20 # 4 3 1 + addq $1,$0,$1 # 4 2 3 1 + cmpult $1,$0,$0 # 4 2 3 2 + addq $20,$0,$0 # 4 3 2 + stq $1,-8($16) # 4 2 4 + bgt $18,$45 + ret $31,($26),1 # else exit + + .align 4 +$43: + addq $18,4,$18 + bgt $18,$45 # goto tail code + ret $31,($26),1 # else exit + + .end bn_mul_add_words + .align 3 + .globl bn_mul_words + .ent bn_mul_words +bn_mul_words: +bn_mul_words..ng: + .frame $30,0,$26,0 + .prologue 0 + .align 5 + subq $18,4,$18 + bis $31,$31,$0 + blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + .align 3 +$142: + + mulq $20,$19,$5 # 1 2 1 ##### + ldq $21,8($17) # 2 1 + ldq $27,16($17) # 3 1 + umulh $20,$19,$20 # 1 2 ##### + ldq $28,24($17) # 4 1 + mulq $21,$19,$6 # 2 2 1 ##### + addq $5,$0,$5 # 1 2 3 1 + subq $18,4,$18 + cmpult $5,$0,$0 # 1 2 3 2 + umulh $21,$19,$21 # 2 2 ##### + addq $20,$0,$0 # 1 3 2 + addq $17,32,$17 + addq $6,$0,$6 # 2 2 3 1 + mulq $27,$19,$7 # 3 2 1 ##### + cmpult $6,$0,$0 # 2 2 3 2 + addq $21,$0,$0 # 2 3 2 + addq $16,32,$16 + umulh $27,$19,$27 # 3 2 ##### + stq $5,-32($16) # 1 2 4 + mulq $28,$19,$8 # 4 2 1 ##### + addq $7,$0,$7 # 3 2 3 1 + stq $6,-24($16) # 2 2 4 + cmpult $7,$0,$0 # 3 2 3 2 + umulh $28,$19,$28 # 4 2 ##### + addq $27,$0,$0 # 3 3 2 + stq $7,-16($16) # 3 2 4 + addq $8,$0,$8 # 4 2 3 1 + cmpult $8,$0,$0 # 4 2 3 2 + + addq $28,$0,$0 # 4 3 2 + + stq $8,-8($16) # 4 2 4 + + blt $18,$143 + + ldq $20,0($17) # 1 1 + + br $142 + + .align 4 +$145: + ldq $20,0($17) # 4 1 + mulq $20,$19,$5 # 4 2 1 + subq $18,1,$18 + umulh $20,$19,$20 # 4 2 + addq $5,$0,$5 # 4 2 3 1 + addq $16,8,$16 + cmpult $5,$0,$0 # 4 2 3 2 + addq $17,8,$17 + addq $20,$0,$0 # 4 3 2 + stq $5,-8($16) # 4 2 4 + + bgt $18,$145 + ret $31,($26),1 # else exit + + .align 4 +$143: + addq $18,4,$18 + bgt $18,$145 # goto tail code + ret $31,($26),1 # else exit + + .end bn_mul_words + .align 3 + .globl bn_sqr_words + .ent bn_sqr_words +bn_sqr_words: +bn_sqr_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18,4,$18 + blt $18,$543 # 
if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + .align 3 +$542: + mulq $20,$20,$5 ###### + ldq $21,8($17) # 1 1 + subq $18,4 + umulh $20,$20,$1 ###### + ldq $27,16($17) # 1 1 + mulq $21,$21,$6 ###### + ldq $28,24($17) # 1 1 + stq $5,0($16) # r[0] + umulh $21,$21,$2 ###### + stq $1,8($16) # r[1] + mulq $27,$27,$7 ###### + stq $6,16($16) # r[0] + umulh $27,$27,$3 ###### + stq $2,24($16) # r[1] + mulq $28,$28,$8 ###### + stq $7,32($16) # r[0] + umulh $28,$28,$4 ###### + stq $3,40($16) # r[1] + + addq $16,64,$16 + addq $17,32,$17 + stq $8,-16($16) # r[0] + stq $4,-8($16) # r[1] + + blt $18,$543 + ldq $20,0($17) # 1 1 + br $542 + +$442: + ldq $20,0($17) # a[0] + mulq $20,$20,$5 # a[0]*w low part r2 + addq $16,16,$16 + addq $17,8,$17 + subq $18,1,$18 + umulh $20,$20,$1 # a[0]*w high part r3 + stq $5,-16($16) # r[0] + stq $1,-8($16) # r[1] + + bgt $18,$442 + ret $31,($26),1 # else exit + + .align 4 +$543: + addq $18,4,$18 + bgt $18,$442 # goto tail code + ret $31,($26),1 # else exit + .end bn_sqr_words + + .align 3 + .globl bn_add_words + .ent bn_add_words +bn_add_words: +bn_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19,4,$19 + bis $31,$31,$0 # carry = 0 + blt $19,$900 + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + .align 3 +$901: + addq $1,$5,$1 # r=a+b; + ldq $6,8($17) # a[1] + cmpult $1,$5,$22 # did we overflow? + ldq $2,8($18) # b[1] + addq $1,$0,$1 # c+= overflow + ldq $7,16($17) # a[2] + cmpult $1,$0,$0 # overflow? + ldq $3,16($18) # b[2] + addq $0,$22,$0 + ldq $8,24($17) # a[3] + addq $2,$6,$2 # r=a+b; + ldq $4,24($18) # b[3] + cmpult $2,$6,$23 # did we overflow? + addq $3,$7,$3 # r=a+b; + addq $2,$0,$2 # c+= overflow + cmpult $3,$7,$24 # did we overflow? + cmpult $2,$0,$0 # overflow? + addq $4,$8,$4 # r=a+b; + addq $0,$23,$0 + cmpult $4,$8,$25 # did we overflow? + addq $3,$0,$3 # c+= overflow + stq $1,0($16) # r[0]=c + cmpult $3,$0,$0 # overflow? 
+ stq $2,8($16) # r[1]=c + addq $0,$24,$0 + stq $3,16($16) # r[2]=c + addq $4,$0,$4 # c+= overflow + subq $19,4,$19 # loop-- + cmpult $4,$0,$0 # overflow? + addq $17,32,$17 # a++ + addq $0,$25,$0 + stq $4,24($16) # r[3]=c + addq $18,32,$18 # b++ + addq $16,32,$16 # r++ + + blt $19,$900 + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + br $901 + .align 4 +$945: + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + addq $1,$5,$1 # r=a+b; + subq $19,1,$19 # loop-- + addq $1,$0,$1 # c+= overflow + addq $17,8,$17 # a++ + cmpult $1,$5,$22 # did we overflow? + cmpult $1,$0,$0 # overflow? + addq $18,8,$18 # b++ + stq $1,0($16) # r[0]=c + addq $0,$22,$0 + addq $16,8,$16 # r++ + + bgt $19,$945 + ret $31,($26),1 # else exit + +$900: + addq $19,4,$19 + bgt $19,$945 # goto tail code + ret $31,($26),1 # else exit + .end bn_add_words + + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the lables. + # +.text + .align 3 + .globl bn_div64 + .ent bn_div64 +bn_div64: + ldgp $29,0($27) +bn_div64..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$119 + lda $0,-1 + br $31,$136 + .align 4 +$119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$123: + srl $9,32,$1 + subq $1,$5,$1 + bne 
$1,$126 + zapnot $7,15,$27 + br $31,$127 + .align 4 +$126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$127: + srl $10,32,$4 + .align 5 +$128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$129 + subq $27,1,$27 + br $31,$128 + .align 4 +$129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$134 + addq $9,$11,$9 + subq $27,1,$27 +$134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$123 + .align 4 +$124: + bis $13,$27,$0 +$136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div64 + + .set noat + .text + .align 3 + .globl bn_sub_words + .ent bn_sub_words +bn_sub_words: +bn_sub_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19, 4, $19 + bis $31, $31, $0 + blt $19, $100 + ldq $1, 0($17) + ldq $2, 0($18) +$101: + ldq $3, 8($17) + cmpult $1, $2, $4 + ldq $5, 8($18) + subq $1, $2, $1 + ldq $6, 16($17) + cmpult $1, $0, $2 + ldq $7, 16($18) + subq $1, $0, $23 + ldq $8, 24($17) + addq $2, $4, $0 + cmpult $3, $5, $24 + subq $3, $5, $3 + ldq $22, 24($18) + cmpult $3, $0, $5 + subq $3, $0, $25 + addq $5, $24, $0 + cmpult $6, $7, $27 + subq $6, $7, $6 + stq $23, 0($16) + cmpult $6, $0, $7 + subq $6, $0, $28 + addq $7, $27, $0 + cmpult $8, $22, $21 + subq $8, $22, $8 + stq $25, 8($16) + cmpult $8, $0, $22 + subq $8, $0, $20 + addq $22, $21, $0 + stq $28, 16($16) + subq $19, 4, $19 + stq $20, 24($16) + addq $17, 32, $17 + addq $18, 32, $18 + addq $16, 32, $16 + blt $19, $100 + ldq $1, 0($17) + ldq $2, 0($18) + br $101 +$102: + ldq $1, 0($17) + ldq $2, 0($18) + cmpult $1, $2, $27 + subq $1, $2, $1 + cmpult $1, $0, $2 + subq $1, $0, $1 + stq $1, 0($16) + 
addq $2, $27, $0 + addq $17, 8, $17 + addq $18, 8, $18 + addq $16, 8, $16 + subq $19, 1, $19 + bgt $19, $102 + ret $31,($26),1 +$100: + addq $19, 4, $19 + bgt $19, $102 +$103: + ret $31,($26),1 + .end bn_sub_words diff --git a/crypto/bn/asm/alpha.works/add.pl b/crypto/bn/asm/alpha.works/add.pl new file mode 100644 index 0000000000..4dc76e6b69 --- /dev/null +++ b/crypto/bn/asm/alpha.works/add.pl @@ -0,0 +1,119 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_add_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$b0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 
+ &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +################################################## + # Do the last 0..3 words + + ($t0,$o0)=&NR(2); + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); # will we borrow? + &add($o0,$cc,$o0); # will we borrow? + &cmpult($o0,$cc,$cc); # will we borrow? + &add($cc,$t0,$cc); # add the borrows + &st($o0,&QWPw(0,$rp)); # save + + &add($ap,$QWS,$ap); + &add($bp,$QWS,$bp); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &FR($o0,$t0,$a0,$b0); + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/div.pl b/crypto/bn/asm/alpha.works/div.pl new file mode 100644 index 0000000000..7ec144377f --- /dev/null +++ b/crypto/bn/asm/alpha.works/div.pl @@ -0,0 +1,144 @@ +#!/usr/local/bin/perl + +sub bn_div64 + { + local($data)=<<'EOF'; + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the lables. 
+ # +.text + .set noreorder + .set volatile + .align 3 + .globl bn_div64 + .ent bn_div64 +bn_div64: + ldgp $29,0($27) +bn_div64..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$9119 + lda $0,-1 + br $31,$9136 + .align 4 +$9119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$9120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$9120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$9120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$9122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$9122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$9123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$9126 + zapnot $7,15,$27 + br $31,$9127 + .align 4 +$9126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$9127: + srl $10,32,$4 + .align 5 +$9128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$9129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$9129 + subq $27,1,$27 + br $31,$9128 + .align 4 +$9129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$9134 + addq $9,$11,$9 + subq $27,1,$27 +$9134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$9124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$9123 + .align 4 +$9124: + bis $13,$27,$0 +$9136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq 
$30,48,$30 + ret $31,($26),1 + .end bn_div64 +EOF + &asm_add($data); + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul.pl b/crypto/bn/asm/alpha.works/mul.pl new file mode 100644 index 0000000000..b182bae452 --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul.pl @@ -0,0 +1,116 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + 
&blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &mul($a0,$word,($l0)=&NR(1)); + &add($ap,$QWS,$ap); + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + &add($l0,$cc,$l0); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &cmpult($l0,$cc,$cc); + &st($l0,&QWPw(-1,$rp)); &FR($l0); + &add($h0,$cc,$cc); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_add.pl b/crypto/bn/asm/alpha.works/mul_add.pl new file mode 100644 index 0000000000..e37f6315fb --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_add.pl @@ -0,0 +1,120 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_add_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + 
&cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &ld(($r0)=&NR(1),&QWPw(0,$rp)); # get b + &mul($a0,$word,($l0)=&NR(1)); + &sub($count,1,$count); + &add($ap,$QWS,$ap); + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + &add($r0,$l0,$r0); + &add($rp,$QWS,$rp); + &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0); + &add($r0,$cc,$r0); + &add($h0,$t0,$h0); &FR($t0); + &cmpult($r0,$cc,$cc); + &st($r0,&QWPw(-1,$rp)); &FR($r0); + &add($h0,$cc,$cc); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_c4.pl b/crypto/bn/asm/alpha.works/mul_c4.pl new file mode 100644 index 0000000000..5efd201281 --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_c4.pl @@ -0,0 +1,213 @@ +#!/usr/local/bin/perl +# alpha assember + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + 
&mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &add($t1,$h1,$h1); &FR($t1); + &add($c1,$h1,$c1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub bn_mul_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &mul($a[0],$b[0],($r00)=&NR(1)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &muh($a[0],$b[0],($r01)=&NR(1)); + &FR($ap); &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &FR($bp); &ld(($b[3])=&NR(1),&QWPw(3,$bp)); + &mul($a[0],$b[1],($r02)=&NR(1)); + + ($R,$H1,$H2)=&NR(3); + + &st($r00,&QWPw(0,$rp)); &FR($r00); + + &mov("zero",$R); + &mul($a[1],$b[0],($r03)=&NR(1)); + + &mov("zero",$H1); + &mov("zero",$H0); + &add($R,$r01,$R); + &muh($a[0],$b[1],($r04)=&NR(1)); + &cmpult($R,$r01,($t01)=&NR(1)); &FR($r01); + &add($R,$r02,$R); + &add($H1,$t01,$H1) &FR($t01); + &muh($a[1],$b[0],($r05)=&NR(1)); + &cmpult($R,$r02,($t02)=&NR(1)); &FR($r02); + &add($R,$r03,$R); + &add($H2,$t02,$H2) &FR($t02); + &mul($a[0],$b[2],($r06)=&NR(1)); + &cmpult($R,$r03,($t03)=&NR(1)); &FR($r03); + &add($H1,$t03,$H1) &FR($t03); + &st($R,&QWPw(1,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r04,$R); + &mov("zero",$H2); + &mul($a[1],$b[1],($r07)=&NR(1)); + &cmpult($R,$r04,($t04)=&NR(1)); &FR($r04); + &add($R,$r05,$R); + &add($H1,$t04,$H1) &FR($t04); + &mul($a[2],$b[0],($r08)=&NR(1)); + &cmpult($R,$r05,($t05)=&NR(1)); &FR($r05); + &add($R,$r01,$R); + &add($H2,$t05,$H2) &FR($t05); + &muh($a[0],$b[2],($r09)=&NR(1)); + &cmpult($R,$r06,($t06)=&NR(1)); &FR($r06); + &add($R,$r07,$R); + &add($H1,$t06,$H1) &FR($t06); + &muh($a[1],$b[1],($r10)=&NR(1)); + 
&cmpult($R,$r07,($t07)=&NR(1)); &FR($r07); + &add($R,$r08,$R); + &add($H2,$t07,$H2) &FR($t07); + &muh($a[2],$b[0],($r11)=&NR(1)); + &cmpult($R,$r08,($t08)=&NR(1)); &FR($r08); + &add($H1,$t08,$H1) &FR($t08); + &st($R,&QWPw(2,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r09,$R); + &mov("zero",$H2); + &mul($a[0],$b[3],($r12)=&NR(1)); + &cmpult($R,$r09,($t09)=&NR(1)); &FR($r09); + &add($R,$r10,$R); + &add($H1,$t09,$H1) &FR($t09); + &mul($a[1],$b[2],($r13)=&NR(1)); + &cmpult($R,$r10,($t10)=&NR(1)); &FR($r10); + &add($R,$r11,$R); + &add($H1,$t10,$H1) &FR($t10); + &mul($a[2],$b[1],($r14)=&NR(1)); + &cmpult($R,$r11,($t11)=&NR(1)); &FR($r11); + &add($R,$r12,$R); + &add($H1,$t11,$H1) &FR($t11); + &mul($a[3],$b[0],($r15)=&NR(1)); + &cmpult($R,$r12,($t12)=&NR(1)); &FR($r12); + &add($R,$r13,$R); + &add($H1,$t12,$H1) &FR($t12); + &muh($a[0],$b[3],($r16)=&NR(1)); + &cmpult($R,$r13,($t13)=&NR(1)); &FR($r13); + &add($R,$r14,$R); + &add($H1,$t13,$H1) &FR($t13); + &muh($a[1],$b[2],($r17)=&NR(1)); + &cmpult($R,$r14,($t14)=&NR(1)); &FR($r14); + &add($R,$r15,$R); + &add($H1,$t14,$H1) &FR($t14); + &muh($a[2],$b[1],($r18)=&NR(1)); + &cmpult($R,$r15,($t15)=&NR(1)); &FR($r15); + &add($H1,$t15,$H1) &FR($t15); + &st($R,&QWPw(3,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r16,$R); + &mov("zero",$H2); + &muh($a[3],$b[0],($r19)=&NR(1)); + &cmpult($R,$r16,($t16)=&NR(1)); &FR($r16); + &add($R,$r17,$R); + &add($H1,$t16,$H1) &FR($t16); + &mul($a[1],$b[3],($r20)=&NR(1)); + &cmpult($R,$r17,($t17)=&NR(1)); &FR($r17); + &add($R,$r18,$R); + &add($H1,$t17,$H1) &FR($t17); + &mul($a[2],$b[2],($r21)=&NR(1)); + &cmpult($R,$r18,($t18)=&NR(1)); &FR($r18); + &add($R,$r19,$R); + &add($H1,$t18,$H1) &FR($t18); + &mul($a[3],$b[1],($r22)=&NR(1)); + &cmpult($R,$r19,($t19)=&NR(1)); &FR($r19); + &add($R,$r20,$R); + &add($H1,$t19,$H1) &FR($t19); + &muh($a[1],$b[3],($r23)=&NR(1)); + &cmpult($R,$r20,($t20)=&NR(1)); &FR($r20); + &add($R,$r21,$R); + &add($H1,$t20,$H1) &FR($t20); + 
&muh($a[2],$b[2],($r24)=&NR(1)); + &cmpult($R,$r21,($t21)=&NR(1)); &FR($r21); + &add($R,$r22,$R); + &add($H1,$t21,$H1) &FR($t21); + &muh($a[3],$b[1],($r25)=&NR(1)); + &cmpult($R,$r22,($t22)=&NR(1)); &FR($r22); + &add($H1,$t22,$H1) &FR($t22); + &st($R,&QWPw(4,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r23,$R); + &mov("zero",$H2); + &mul($a[2],$b[3],($r26)=&NR(1)); + &cmpult($R,$r23,($t23)=&NR(1)); &FR($r23); + &add($R,$r24,$R); + &add($H1,$t23,$H1) &FR($t23); + &mul($a[3],$b[2],($r27)=&NR(1)); + &cmpult($R,$r24,($t24)=&NR(1)); &FR($r24); + &add($R,$r25,$R); + &add($H1,$t24,$H1) &FR($t24); + &muh($a[2],$b[3],($r28)=&NR(1)); + &cmpult($R,$r25,($t25)=&NR(1)); &FR($r25); + &add($R,$r26,$R); + &add($H1,$t25,$H1) &FR($t25); + &muh($a[3],$b[2],($r29)=&NR(1)); + &cmpult($R,$r26,($t26)=&NR(1)); &FR($r26); + &add($R,$r27,$R); + &add($H1,$t26,$H1) &FR($t26); + &mul($a[3],$b[3],($r30)=&NR(1)); + &cmpult($R,$r27,($t27)=&NR(1)); &FR($r27); + &add($H1,$t27,$H1) &FR($t27); + &st($R,&QWPw(5,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r28,$R); + &mov("zero",$H2); + &muh($a[3],$b[3],($r31)=&NR(1)); + &cmpult($R,$r28,($t28)=&NR(1)); &FR($r28); + &add($R,$r29,$R); + &add($H1,$t28,$H1) &FR($t28); + ############ + &cmpult($R,$r29,($t29)=&NR(1)); &FR($r29); + &add($R,$r30,$R); + &add($H1,$t29,$H1) &FR($t29); + ############ + &cmpult($R,$r30,($t30)=&NR(1)); &FR($r30); + &add($H1,$t30,$H1) &FR($t30); + &st($R,&QWPw(6,$rp)); + &add($H1,$H2,$R); + + &add($R,$r31,$R); &FR($r31); + &st($R,&QWPw(7,$rp)); + + &FR($R,$H1,$H2); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_c4.works.pl b/crypto/bn/asm/alpha.works/mul_c4.works.pl new file mode 100644 index 0000000000..79d86dd25c --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_c4.works.pl @@ -0,0 +1,98 @@ +#!/usr/local/bin/perl +# alpha assember + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + +print STDERR "count=$cnt\n"; $cnt++; + 
&mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &add($t1,$h1,$h1); &FR($t1); + &add($c1,$h1,$c1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub bn_mul_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap); + &ld(($b[3])=&NR(1),&QWPw(3,$bp)); &FR($bp); + + ($c0,$c1,$c2)=&NR(3); + &mov("zero",$c2); + &mul($a[0],$b[0],$c0); + &muh($a[0],$b[0],$c1); + &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[1],$c0,$c1,$c2); + &mul_add_c($a[1],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[1],$c0,$c1,$c2); + &mul_add_c($a[0],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[3],$c0,$c1,$c2); &FR($a[0]); + &mul_add_c($a[1],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[1],$c0,$c1,$c2); + &mul_add_c($a[3],$b[0],$c0,$c1,$c2); &FR($b[0]); + &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[3],$c0,$c1,$c2); &FR($a[1]); + &mul_add_c($a[2],$b[2],$c0,$c1,$c2); + &mul_add_c($a[3],$b[1],$c0,$c1,$c2); &FR($b[1]); + &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[2],$b[3],$c0,$c1,$c2); &FR($a[2]); + &mul_add_c($a[3],$b[2],$c0,$c1,$c2); 
&FR($b[2]); + &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[3],$b[3],$c0,$c1,$c2); &FR($a[3],$b[3]); + &st($c0,&QWPw(6,$rp)); + &st($c1,&QWPw(7,$rp)); + + &FR($c0,$c1,$c2); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_c8.pl b/crypto/bn/asm/alpha.works/mul_c8.pl new file mode 100644 index 0000000000..525ca7494b --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_c8.pl @@ -0,0 +1,177 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_comba8 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &stack_push(2); + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &st($reg_s0,&swtmp(0)); &FR($reg_s0); + &st($reg_s1,&swtmp(1)); &FR($reg_s1); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &ld(($b[3])=&NR(1),&QWPw(3,$bp)); + &ld(($a[4])=&NR(1),&QWPw(1,$ap)); + &ld(($b[4])=&NR(1),&QWPw(1,$bp)); + &ld(($a[5])=&NR(1),&QWPw(1,$ap)); + &ld(($b[5])=&NR(1),&QWPw(1,$bp)); + &ld(($a[6])=&NR(1),&QWPw(1,$ap)); + &ld(($b[6])=&NR(1),&QWPw(1,$bp)); + &ld(($a[7])=&NR(1),&QWPw(1,$ap)); &FR($ap); + &ld(($b[7])=&NR(1),&QWPw(1,$bp)); &FR($bp); + + ($c0,$c1,$c2)=&NR(3); + &mov("zero",$c2); + &mul($a[0],$b[0],$c0); + &muh($a[0],$b[0],$c1); + &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[1],$c0,$c1,$c2); + &mul_add_c($a[1],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[2],$c0,$c1,$c2); + &mul_add_c($a[1],$b[1],$c0,$c1,$c2); + &mul_add_c($a[2],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR(1); + 
($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[3],$c0,$c1,$c2); + &mul_add_c($a[1],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[1],$c0,$c1,$c2); + &mul_add_c($a[3],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[4],$c0,$c1,$c2); + &mul_add_c($a[1],$b[3],$c0,$c1,$c2); + &mul_add_c($a[2],$b[2],$c0,$c1,$c2); + &mul_add_c($a[3],$b[1],$c0,$c1,$c2); + &mul_add_c($a[4],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[5],$c0,$c1,$c2); + &mul_add_c($a[1],$b[4],$c0,$c1,$c2); + &mul_add_c($a[2],$b[3],$c0,$c1,$c2); + &mul_add_c($a[3],$b[2],$c0,$c1,$c2); + &mul_add_c($a[4],$b[1],$c0,$c1,$c2); + &mul_add_c($a[5],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[6],$c0,$c1,$c2); + &mul_add_c($a[1],$b[5],$c0,$c1,$c2); + &mul_add_c($a[2],$b[4],$c0,$c1,$c2); + &mul_add_c($a[3],$b[3],$c0,$c1,$c2); + &mul_add_c($a[4],$b[2],$c0,$c1,$c2); + &mul_add_c($a[5],$b[1],$c0,$c1,$c2); + &mul_add_c($a[6],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[7],$c0,$c1,$c2); &FR($a[0]); + &mul_add_c($a[1],$b[6],$c0,$c1,$c2); + &mul_add_c($a[2],$b[5],$c0,$c1,$c2); + &mul_add_c($a[3],$b[4],$c0,$c1,$c2); + &mul_add_c($a[4],$b[3],$c0,$c1,$c2); + &mul_add_c($a[5],$b[2],$c0,$c1,$c2); + &mul_add_c($a[6],$b[1],$c0,$c1,$c2); + &mul_add_c($a[7],$b[0],$c0,$c1,$c2); &FR($b[0]); + &st($c0,&QWPw(7,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[7],$c0,$c1,$c2); &FR($a[1]); + &mul_add_c($a[2],$b[6],$c0,$c1,$c2); + &mul_add_c($a[3],$b[5],$c0,$c1,$c2); + &mul_add_c($a[4],$b[4],$c0,$c1,$c2); + &mul_add_c($a[5],$b[3],$c0,$c1,$c2); + 
&mul_add_c($a[6],$b[2],$c0,$c1,$c2); + &mul_add_c($a[7],$b[1],$c0,$c1,$c2); &FR($b[1]); + &st($c0,&QWPw(8,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[2],$b[7],$c0,$c1,$c2); &FR($a[2]); + &mul_add_c($a[3],$b[6],$c0,$c1,$c2); + &mul_add_c($a[4],$b[5],$c0,$c1,$c2); + &mul_add_c($a[5],$b[4],$c0,$c1,$c2); + &mul_add_c($a[6],$b[3],$c0,$c1,$c2); + &mul_add_c($a[7],$b[2],$c0,$c1,$c2); &FR($b[2]); + &st($c0,&QWPw(9,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[3],$b[7],$c0,$c1,$c2); &FR($a[3]); + &mul_add_c($a[4],$b[6],$c0,$c1,$c2); + &mul_add_c($a[5],$b[5],$c0,$c1,$c2); + &mul_add_c($a[6],$b[4],$c0,$c1,$c2); + &mul_add_c($a[7],$b[3],$c0,$c1,$c2); &FR($b[3]); + &st($c0,&QWPw(10,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[4],$b[7],$c0,$c1,$c2); &FR($a[4]); + &mul_add_c($a[5],$b[6],$c0,$c1,$c2); + &mul_add_c($a[6],$b[5],$c0,$c1,$c2); + &mul_add_c($a[7],$b[4],$c0,$c1,$c2); &FR($b[4]); + &st($c0,&QWPw(11,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[5],$b[7],$c0,$c1,$c2); &FR($a[5]); + &mul_add_c($a[6],$b[6],$c0,$c1,$c2); + &mul_add_c($a[7],$b[5],$c0,$c1,$c2); &FR($b[5]); + &st($c0,&QWPw(12,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[6],$b[7],$c0,$c1,$c2); &FR($a[6]); + &mul_add_c($a[7],$b[6],$c0,$c1,$c2); &FR($b[6]); + &st($c0,&QWPw(13,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[7],$b[7],$c0,$c1,$c2); &FR($a[7],$b[7]); + &st($c0,&QWPw(14,$rp)); + &st($c1,&QWPw(15,$rp)); + + &FR($c0,$c1,$c2); + + &ld($reg_s0,&swtmp(0)); + &ld($reg_s1,&swtmp(1)); + &stack_pop(2); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/sqr.pl b/crypto/bn/asm/alpha.works/sqr.pl new file mode 100644 index 0000000000..a55b696906 --- 
/dev/null +++ b/crypto/bn/asm/alpha.works/sqr.pl @@ -0,0 +1,113 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sqr_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(3); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + 
&ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &mul($a0,$a0,($l0)=&NR(1)); + &add($ap,$QWS,$ap); + &add($rp,2*$QWS,$rp); + &sub($count,1,$count); + &muh($a0,$a0,($h0)=&NR(1)); &FR($a0); + &st($l0,&QWPw(-2,$rp)); &FR($l0); + &st($h0,&QWPw(-1,$rp)); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/sqr_c4.pl b/crypto/bn/asm/alpha.works/sqr_c4.pl new file mode 100644 index 0000000000..bf33f5b503 --- /dev/null +++ b/crypto/bn/asm/alpha.works/sqr_c4.pl @@ -0,0 +1,109 @@ +#!/usr/local/bin/perl +# alpha assember + +sub sqr_add_c + { + local($a,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$a,($l1)=&NR(1)); + &muh($a,$a,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &add($c1,$h1,$c1); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c1,$t1,$c1); &FR($t1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub sqr_add_c2 + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &cmplt($l1,"zero",($lc1)=&NR(1)); + &cmplt($h1,"zero",($hc1)=&NR(1)); + &add($l1,$l1,$l1); + &add($h1,$h1,$h1); + &add($h1,$lc1,$h1); &FR($lc1); + &add($c2,$hc1,$c2); &FR($hc1); + + &add($c0,$l1,$c0); + &add($c1,$h1,$c1); + &cmpult($c0,$l1,($lc1)=&NR(1)); &FR($l1); + &cmpult($c1,$h1,($hc1)=&NR(1)); &FR($h1); + + &add($c1,$lc1,$c1); &FR($lc1); + &add($c2,$hc1,$c2); &FR($hc1); + } + + +sub bn_sqr_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(2); + + $rp=&wparam(0); + $ap=&wparam(1); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap); + + 
($c0,$c1,$c2)=&NR(3); + + &mov("zero",$c2); + &mul($a[0],$a[0],$c0); + &muh($a[0],$a[0],$c1); + &st($c0,&QWPw(0,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[0],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[1],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[2],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[3],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); + &st($c1,&QWPw(7,$rp)); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/sqr_c8.pl b/crypto/bn/asm/alpha.works/sqr_c8.pl new file mode 100644 index 0000000000..b4afe085f1 --- /dev/null +++ b/crypto/bn/asm/alpha.works/sqr_c8.pl @@ -0,0 +1,132 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sqr_comba8 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(2); + + $rp=&wparam(0); + $ap=&wparam(1); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &ld(($a[4])=&NR(1),&QWPw(4,$ap)); + &ld(($a[5])=&NR(1),&QWPw(5,$ap)); + &ld(($a[6])=&NR(1),&QWPw(6,$ap)); + &ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap); + + ($c0,$c1,$c2)=&NR(3); + + &mov("zero",$c2); + &mul($a[0],$a[0],$c0); + &muh($a[0],$a[0],$c1); + &st($c0,&QWPw(0,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[1],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); + 
($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[1],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[2],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[3],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[4],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(7,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[4],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(8,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[5],$a[4],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[2],$c0,$c1,$c2); + &st($c0,&QWPw(9,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[5],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[4],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[3],$c0,$c1,$c2); + &st($c0,&QWPw(10,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[6],$a[5],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[4],$c0,$c1,$c2); + &st($c0,&QWPw(11,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + 
&sqr_add_c($a[6],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[5],$c0,$c1,$c2); + &st($c0,&QWPw(12,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[7],$a[6],$c0,$c1,$c2); + &st($c0,&QWPw(13,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[7],$c0,$c1,$c2); + &st($c0,&QWPw(14,$rp)); + &st($c1,&QWPw(15,$rp)); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/sub.pl b/crypto/bn/asm/alpha.works/sub.pl new file mode 100644 index 0000000000..d998da5c21 --- /dev/null +++ b/crypto/bn/asm/alpha.works/sub.pl @@ -0,0 +1,108 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sub_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &blt($count,&label("finish")); + + ($a0,$b0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + +########################################################## + &set_label("loop"); + + ($a1,$tmp,$b1,$a2,$b2,$a3,$b3,$o0)=&NR(8); + &ld($a1,&QWPw(1,$ap)); + &cmpult($a0,$b0,$tmp); # will we borrow? + &ld($b1,&QWPw(1,$bp)); + &sub($a0,$b0,$a0); # do the subtract + &ld($a2,&QWPw(2,$ap)); + &cmpult($a0,$cc,$b0); # will we borrow? + &ld($b2,&QWPw(2,$bp)); + &sub($a0,$cc,$o0); # will we borrow? + &ld($a3,&QWPw(3,$ap)); + &add($b0,$tmp,$cc); ($t1,$o1)=&NR(2); &FR($tmp); + + &cmpult($a1,$b1,$t1); # will we borrow? + &sub($a1,$b1,$a1); # do the subtract + &ld($b3,&QWPw(3,$bp)); + &cmpult($a1,$cc,$b1); # will we borrow? + &sub($a1,$cc,$o1); # will we borrow? + &add($b1,$t1,$cc); ($tmp,$o2)=&NR(2); &FR($t1,$a1,$b1); + + &cmpult($a2,$b2,$tmp); # will we borrow? + &sub($a2,$b2,$a2); # do the subtract + &st($o0,&QWPw(0,$rp)); &FR($o0); # save + &cmpult($a2,$cc,$b2); # will we borrow? + &sub($a2,$cc,$o2); # will we borrow? 
+ &add($b2,$tmp,$cc); ($t3,$o3)=&NR(2); &FR($tmp,$a2,$b2); + + &cmpult($a3,$b3,$t3); # will we borrow? + &sub($a3,$b3,$a3); # do the subtract + &st($o1,&QWPw(1,$rp)); &FR($o1); + &cmpult($a3,$cc,$b3); # will we borrow? + &sub($a3,$cc,$o3); # will we borrow? + &add($b3,$t3,$cc); &FR($t3,$a3,$b3); + + &st($o2,&QWPw(2,$rp)); &FR($o2); + &sub($count,4,$count); # count-=4 + &st($o3,&QWPw(3,$rp)); &FR($o3); + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + &cmpult($a0,$b0,$tmp); # will we borrow? + &sub($a0,$b0,$a0); # do the subtract + &cmpult($a0,$cc,$b0); # will we borrow? + &sub($a0,$cc,$a0); # will we borrow? + &st($a0,&QWPw(0,$rp)); # save + &add($b0,$tmp,$cc); # add the borrows + + &add($ap,$QWS,$ap); + &add($bp,$QWS,$bp); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &FR($a0,$b0); + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/add.pl b/crypto/bn/asm/alpha/add.pl new file mode 100644 index 0000000000..13bf516428 --- /dev/null +++ b/crypto/bn/asm/alpha/add.pl @@ -0,0 +1,118 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_add_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &blt($count,&label("finish")); + + ($a0,$b0)=&NR(2); + 
+########################################################## + &set_label("loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); + &ld(($b0)=&NR(1),&QWPw(0,$bp)); + &ld(($a1)=&NR(1),&QWPw(1,$ap)); + &ld(($b1)=&NR(1),&QWPw(1,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &ld(($a2)=&NR(1),&QWPw(2,$ap)); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &ld(($b2)=&NR(1),&QWPw(2,$bp)); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &ld(($a3)=&NR(1),&QWPw(3,$ap)); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &ld(($b3)=&NR(1),&QWPw(3,$bp)); + &st($o0,&QWPw(0,$rp)); &FR($o0); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &st($o1,&QWPw(0,$rp)); &FR($o1); + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &cmpult($o3,$cc,$cc); + &st($o3,&QWPw(0,$rp)); &FR($o3); + &add($cc,$t3,$cc); &FR($t3); + + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + ### + &bge($count,&label("loop")); + ### + &br(&label("finish")); +################################################## + # Do the last 0..3 words + + ($t0,$o0)=&NR(2); + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + &add($ap,$QWS,$ap); + &add($bp,$QWS,$bp); + &add($a0,$b0,$o0); + &sub($count,1,$count); + &cmpult($o0,$b0,$t0); # will we borrow? + &add($o0,$cc,$o0); # will we borrow? + &cmpult($o0,$cc,$cc); # will we borrow? 
+ &add($rp,$QWS,$rp); + &st($o0,&QWPw(-1,$rp)); # save + &add($cc,$t0,$cc); # add the borrows + + ### + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &FR($o0,$t0,$a0,$b0); + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/div.pl b/crypto/bn/asm/alpha/div.pl new file mode 100644 index 0000000000..e9e680897a --- /dev/null +++ b/crypto/bn/asm/alpha/div.pl @@ -0,0 +1,144 @@ +#!/usr/local/bin/perl + +sub bn_div_words + { + local($data)=<<'EOF'; + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the lables. + # +.text + .set noreorder + .set volatile + .align 3 + .globl bn_div_words + .ent bn_div_words +bn_div_words + ldgp $29,0($27) +bn_div_words.ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$9119 + lda $0,-1 + br $31,$9136 + .align 4 +$9119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$9120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$9120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$9120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$9122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$9122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$9123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$9126 + zapnot $7,15,$27 + br $31,$9127 + .align 4 +$9126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu 
$24,$25,$27 +$9127: + srl $10,32,$4 + .align 5 +$9128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$9129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$9129 + subq $27,1,$27 + br $31,$9128 + .align 4 +$9129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$9134 + addq $9,$11,$9 + subq $27,1,$27 +$9134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$9124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$9123 + .align 4 +$9124: + bis $13,$27,$0 +$9136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div_words +EOF + &asm_add($data); + } + +1; diff --git a/crypto/bn/asm/alpha/mul.pl b/crypto/bn/asm/alpha/mul.pl new file mode 100644 index 0000000000..76c926566c --- /dev/null +++ b/crypto/bn/asm/alpha/mul.pl @@ -0,0 +1,104 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + ### + &blt($count,&label("finish")); + + ($a0)=&NR(1); &ld($a0,&QWPw(0,$ap)); + + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ### wait 8 + &mul($a0,$word,($l0)=&NR(1)); &FR($a0); + ### wait 8 + &muh($a1,$word,($h1)=&NR(1)); &FR($a1); + &add($l0,$cc,$l0); ### wait 8 + &mul($a1,$word,($l1)=&NR(1)); &FR($a1); + &cmpult($l0,$cc,$cc); ### wait 8 + &muh($a2,$word,($h2)=&NR(1)); &FR($a2); + &add($h0,$cc,$cc); &FR($h0); ### wait 8 + &mul($a2,$word,($l2)=&NR(1)); &FR($a2); + 
&add($l1,$cc,$l1); ### wait 8 + &st($l0,&QWPw(0,$rp)); &FR($l0); + &cmpult($l1,$cc,$cc); ### wait 8 + &muh($a3,$word,($h3)=&NR(1)); &FR($a3); + &add($h1,$cc,$cc); &FR($h1); + &mul($a3,$word,($l3)=&NR(1)); &FR($a3); + &add($l2,$cc,$l2); + &st($l1,&QWPw(1,$rp)); &FR($l1); + &cmpult($l2,$cc,$cc); + &add($h2,$cc,$cc); &FR($h2); + &sub($count,4,$count); # count-=4 + &st($l2,&QWPw(2,$rp)); &FR($l2); + &add($l3,$cc,$l3); + &cmpult($l3,$cc,$cc); + &add($bp,4*$QWS,$bp); # count+=4 + &add($h3,$cc,$cc); &FR($h3); + &add($ap,4*$QWS,$ap); # count+=4 + &st($l3,&QWPw(3,$rp)); &FR($l3); + &add($rp,4*$QWS,$rp); # count+=4 + ### + &blt($count,&label("finish")); + ($a0)=&NR(1); &ld($a0,&QWPw(0,$ap)); + &br(&label("finish")); +################################################## + +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + ### + ### + ### + &muh($a0,$word,($h0)=&NR(1)); + ### Wait 8 for next mul issue + &mul($a0,$word,($l0)=&NR(1)); &FR($a0) + &add($ap,$QWS,$ap); + ### Loose 12 until result is available + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &add($l0,$cc,$l0); + ### + &st($l0,&QWPw(-1,$rp)); &FR($l0); + &cmpult($l0,$cc,$cc); + &add($h0,$cc,$cc); &FR($h0); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/mul_add.pl b/crypto/bn/asm/alpha/mul_add.pl new file mode 100644 index 0000000000..0d6df69bc4 --- /dev/null +++ b/crypto/bn/asm/alpha/mul_add.pl @@ -0,0 +1,123 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_add_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + 
&function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + ### + &blt($count,&label("finish")); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + &ld(($r0)=&NR(1),&QWPw(0,$rp)); + &ld(($a1)=&NR(1),&QWPw(1,$ap)); + &muh($a0,$word,($h0)=&NR(1)); + &ld(($r1)=&NR(1),&QWPw(1,$rp)); + &ld(($a2)=&NR(1),&QWPw(2,$ap)); + ### + &mul($a0,$word,($l0)=&NR(1)); &FR($a0); + &ld(($r2)=&NR(1),&QWPw(2,$rp)); + &muh($a1,$word,($h1)=&NR(1)); + &ld(($a3)=&NR(1),&QWPw(3,$ap)); + &mul($a1,$word,($l1)=&NR(1)); &FR($a1); + &ld(($r3)=&NR(1),&QWPw(3,$rp)); + &add($r0,$l0,$r0); + &add($r1,$l1,$r1); + &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0); + &cmpult($r1,$l1,($t1)=&NR(1)); &FR($l1); + &muh($a2,$word,($h2)=&NR(1)); + &add($r0,$cc,$r0); + &add($h0,$t0,$h0); &FR($t0); + &cmpult($r0,$cc,$cc); + &add($h1,$t1,$h1); &FR($t1); + &add($h0,$cc,$cc); &FR($h0); + &mul($a2,$word,($l2)=&NR(1)); &FR($a2); + &add($r1,$cc,$r1); + &cmpult($r1,$cc,$cc); + &add($r2,$l2,$r2); + &add($h1,$cc,$cc); &FR($h1); + &cmpult($r2,$l2,($t2)=&NR(1)); &FR($l2); + &muh($a3,$word,($h3)=&NR(1)); + &add($r2,$cc,$r2); + &st($r0,&QWPw(0,$rp)); &FR($r0); + &add($h2,$t2,$h2); &FR($t2); + &st($r1,&QWPw(1,$rp)); &FR($r1); + &cmpult($r2,$cc,$cc); + &mul($a3,$word,($l3)=&NR(1)); &FR($a3); + &add($h2,$cc,$cc); &FR($h2); + &st($r2,&QWPw(2,$rp)); &FR($r2); + &sub($count,4,$count); # count-=4 + &add($rp,4*$QWS,$rp); # count+=4 + &add($r3,$l3,$r3); + &add($ap,4*$QWS,$ap); # count+=4 + &cmpult($r3,$l3,($t3)=&NR(1)); &FR($l3); + &add($r3,$cc,$r3); + &add($h3,$t3,$h3); &FR($t3); + &cmpult($r3,$cc,$cc); + &st($r3,&QWPw(-1,$rp)); &FR($r3); + &add($h3,$cc,$cc); &FR($h3); + + ### + &blt($count,&label("finish")); + &ld(($a0)=&NR(1),&QWPw(0,$ap)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + 
&ld(($r0)=&NR(1),&QWPw(0,$rp)); # get b + ### + ### + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + ### wait 8 + &mul($a0,$word,($l0)=&NR(1)); &FR($a0); + &add($rp,$QWS,$rp); + &add($ap,$QWS,$ap); + &sub($count,1,$count); + ### wait 3 until l0 is available + &add($r0,$l0,$r0); + ### + &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0); + &add($r0,$cc,$r0); + &add($h0,$t0,$h0); &FR($t0); + &cmpult($r0,$cc,$cc); + &add($h0,$cc,$cc); &FR($h0); + + &st($r0,&QWPw(-1,$rp)); &FR($r0); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/mul_c4.pl b/crypto/bn/asm/alpha/mul_c4.pl new file mode 100644 index 0000000000..9cc876ded4 --- /dev/null +++ b/crypto/bn/asm/alpha/mul_c4.pl @@ -0,0 +1,215 @@ +#!/usr/local/bin/perl +# alpha assember + +# upto + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &add($t1,$h1,$h1); &FR($t1); + &add($c1,$h1,$c1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub bn_mul_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &mul($a[0],$b[0],($r00)=&NR(1)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &muh($a[0],$b[0],($r01)=&NR(1)); + &FR($ap); &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &FR($bp); &ld(($b[3])=&NR(1),&QWPw(3,$bp)); + &mul($a[0],$b[1],($r02)=&NR(1)); + + ($R,$H1,$H2)=&NR(3); + + &st($r00,&QWPw(0,$rp)); 
&FR($r00); + + &mov("zero",$R); + &mul($a[1],$b[0],($r03)=&NR(1)); + + &mov("zero",$H1); + &mov("zero",$H0); + &add($R,$r01,$R); + &muh($a[0],$b[1],($r04)=&NR(1)); + &cmpult($R,$r01,($t01)=&NR(1)); &FR($r01); + &add($R,$r02,$R); + &add($H1,$t01,$H1) &FR($t01); + &muh($a[1],$b[0],($r05)=&NR(1)); + &cmpult($R,$r02,($t02)=&NR(1)); &FR($r02); + &add($R,$r03,$R); + &add($H2,$t02,$H2) &FR($t02); + &mul($a[0],$b[2],($r06)=&NR(1)); + &cmpult($R,$r03,($t03)=&NR(1)); &FR($r03); + &add($H1,$t03,$H1) &FR($t03); + &st($R,&QWPw(1,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r04,$R); + &mov("zero",$H2); + &mul($a[1],$b[1],($r07)=&NR(1)); + &cmpult($R,$r04,($t04)=&NR(1)); &FR($r04); + &add($R,$r05,$R); + &add($H1,$t04,$H1) &FR($t04); + &mul($a[2],$b[0],($r08)=&NR(1)); + &cmpult($R,$r05,($t05)=&NR(1)); &FR($r05); + &add($R,$r01,$R); + &add($H2,$t05,$H2) &FR($t05); + &muh($a[0],$b[2],($r09)=&NR(1)); + &cmpult($R,$r06,($t06)=&NR(1)); &FR($r06); + &add($R,$r07,$R); + &add($H1,$t06,$H1) &FR($t06); + &muh($a[1],$b[1],($r10)=&NR(1)); + &cmpult($R,$r07,($t07)=&NR(1)); &FR($r07); + &add($R,$r08,$R); + &add($H2,$t07,$H2) &FR($t07); + &muh($a[2],$b[0],($r11)=&NR(1)); + &cmpult($R,$r08,($t08)=&NR(1)); &FR($r08); + &add($H1,$t08,$H1) &FR($t08); + &st($R,&QWPw(2,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r09,$R); + &mov("zero",$H2); + &mul($a[0],$b[3],($r12)=&NR(1)); + &cmpult($R,$r09,($t09)=&NR(1)); &FR($r09); + &add($R,$r10,$R); + &add($H1,$t09,$H1) &FR($t09); + &mul($a[1],$b[2],($r13)=&NR(1)); + &cmpult($R,$r10,($t10)=&NR(1)); &FR($r10); + &add($R,$r11,$R); + &add($H1,$t10,$H1) &FR($t10); + &mul($a[2],$b[1],($r14)=&NR(1)); + &cmpult($R,$r11,($t11)=&NR(1)); &FR($r11); + &add($R,$r12,$R); + &add($H1,$t11,$H1) &FR($t11); + &mul($a[3],$b[0],($r15)=&NR(1)); + &cmpult($R,$r12,($t12)=&NR(1)); &FR($r12); + &add($R,$r13,$R); + &add($H1,$t12,$H1) &FR($t12); + &muh($a[0],$b[3],($r16)=&NR(1)); + &cmpult($R,$r13,($t13)=&NR(1)); &FR($r13); + &add($R,$r14,$R); + 
&add($H1,$t13,$H1) &FR($t13); + &muh($a[1],$b[2],($r17)=&NR(1)); + &cmpult($R,$r14,($t14)=&NR(1)); &FR($r14); + &add($R,$r15,$R); + &add($H1,$t14,$H1) &FR($t14); + &muh($a[2],$b[1],($r18)=&NR(1)); + &cmpult($R,$r15,($t15)=&NR(1)); &FR($r15); + &add($H1,$t15,$H1) &FR($t15); + &st($R,&QWPw(3,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r16,$R); + &mov("zero",$H2); + &muh($a[3],$b[0],($r19)=&NR(1)); + &cmpult($R,$r16,($t16)=&NR(1)); &FR($r16); + &add($R,$r17,$R); + &add($H1,$t16,$H1) &FR($t16); + &mul($a[1],$b[3],($r20)=&NR(1)); + &cmpult($R,$r17,($t17)=&NR(1)); &FR($r17); + &add($R,$r18,$R); + &add($H1,$t17,$H1) &FR($t17); + &mul($a[2],$b[2],($r21)=&NR(1)); + &cmpult($R,$r18,($t18)=&NR(1)); &FR($r18); + &add($R,$r19,$R); + &add($H1,$t18,$H1) &FR($t18); + &mul($a[3],$b[1],($r22)=&NR(1)); + &cmpult($R,$r19,($t19)=&NR(1)); &FR($r19); + &add($R,$r20,$R); + &add($H1,$t19,$H1) &FR($t19); + &muh($a[1],$b[3],($r23)=&NR(1)); + &cmpult($R,$r20,($t20)=&NR(1)); &FR($r20); + &add($R,$r21,$R); + &add($H1,$t20,$H1) &FR($t20); + &muh($a[2],$b[2],($r24)=&NR(1)); + &cmpult($R,$r21,($t21)=&NR(1)); &FR($r21); + &add($R,$r22,$R); + &add($H1,$t21,$H1) &FR($t21); + &muh($a[3],$b[1],($r25)=&NR(1)); + &cmpult($R,$r22,($t22)=&NR(1)); &FR($r22); + &add($H1,$t22,$H1) &FR($t22); + &st($R,&QWPw(4,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r23,$R); + &mov("zero",$H2); + &mul($a[2],$b[3],($r26)=&NR(1)); + &cmpult($R,$r23,($t23)=&NR(1)); &FR($r23); + &add($R,$r24,$R); + &add($H1,$t23,$H1) &FR($t23); + &mul($a[3],$b[2],($r27)=&NR(1)); + &cmpult($R,$r24,($t24)=&NR(1)); &FR($r24); + &add($R,$r25,$R); + &add($H1,$t24,$H1) &FR($t24); + &muh($a[2],$b[3],($r28)=&NR(1)); + &cmpult($R,$r25,($t25)=&NR(1)); &FR($r25); + &add($R,$r26,$R); + &add($H1,$t25,$H1) &FR($t25); + &muh($a[3],$b[2],($r29)=&NR(1)); + &cmpult($R,$r26,($t26)=&NR(1)); &FR($r26); + &add($R,$r27,$R); + &add($H1,$t26,$H1) &FR($t26); + &mul($a[3],$b[3],($r30)=&NR(1)); + &cmpult($R,$r27,($t27)=&NR(1)); &FR($r27); + 
&add($H1,$t27,$H1) &FR($t27); + &st($R,&QWPw(5,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r28,$R); + &mov("zero",$H2); + &muh($a[3],$b[3],($r31)=&NR(1)); + &cmpult($R,$r28,($t28)=&NR(1)); &FR($r28); + &add($R,$r29,$R); + &add($H1,$t28,$H1) &FR($t28); + ############ + &cmpult($R,$r29,($t29)=&NR(1)); &FR($r29); + &add($R,$r30,$R); + &add($H1,$t29,$H1) &FR($t29); + ############ + &cmpult($R,$r30,($t30)=&NR(1)); &FR($r30); + &add($H1,$t30,$H1) &FR($t30); + &st($R,&QWPw(6,$rp)); + &add($H1,$H2,$R); + + &add($R,$r31,$R); &FR($r31); + &st($R,&QWPw(7,$rp)); + + &FR($R,$H1,$H2); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/mul_c4.works.pl b/crypto/bn/asm/alpha/mul_c4.works.pl new file mode 100644 index 0000000000..79d86dd25c --- /dev/null +++ b/crypto/bn/asm/alpha/mul_c4.works.pl @@ -0,0 +1,98 @@ +#!/usr/local/bin/perl +# alpha assember + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + +print STDERR "count=$cnt\n"; $cnt++; + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &add($t1,$h1,$h1); &FR($t1); + &add($c1,$h1,$c1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub bn_mul_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap); + &ld(($b[3])=&NR(1),&QWPw(3,$bp)); &FR($bp); + + ($c0,$c1,$c2)=&NR(3); + &mov("zero",$c2); + &mul($a[0],$b[0],$c0); + &muh($a[0],$b[0],$c1); + &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + 
&mul_add_c($a[0],$b[1],$c0,$c1,$c2); + &mul_add_c($a[1],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[1],$c0,$c1,$c2); + &mul_add_c($a[0],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[3],$c0,$c1,$c2); &FR($a[0]); + &mul_add_c($a[1],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[1],$c0,$c1,$c2); + &mul_add_c($a[3],$b[0],$c0,$c1,$c2); &FR($b[0]); + &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[3],$c0,$c1,$c2); &FR($a[1]); + &mul_add_c($a[2],$b[2],$c0,$c1,$c2); + &mul_add_c($a[3],$b[1],$c0,$c1,$c2); &FR($b[1]); + &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[2],$b[3],$c0,$c1,$c2); &FR($a[2]); + &mul_add_c($a[3],$b[2],$c0,$c1,$c2); &FR($b[2]); + &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[3],$b[3],$c0,$c1,$c2); &FR($a[3],$b[3]); + &st($c0,&QWPw(6,$rp)); + &st($c1,&QWPw(7,$rp)); + + &FR($c0,$c1,$c2); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/mul_c8.pl b/crypto/bn/asm/alpha/mul_c8.pl new file mode 100644 index 0000000000..525ca7494b --- /dev/null +++ b/crypto/bn/asm/alpha/mul_c8.pl @@ -0,0 +1,177 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_comba8 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &stack_push(2); + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &st($reg_s0,&swtmp(0)); &FR($reg_s0); + &st($reg_s1,&swtmp(1)); &FR($reg_s1); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + 
&ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &ld(($b[3])=&NR(1),&QWPw(3,$bp)); + &ld(($a[4])=&NR(1),&QWPw(1,$ap)); + &ld(($b[4])=&NR(1),&QWPw(1,$bp)); + &ld(($a[5])=&NR(1),&QWPw(1,$ap)); + &ld(($b[5])=&NR(1),&QWPw(1,$bp)); + &ld(($a[6])=&NR(1),&QWPw(1,$ap)); + &ld(($b[6])=&NR(1),&QWPw(1,$bp)); + &ld(($a[7])=&NR(1),&QWPw(1,$ap)); &FR($ap); + &ld(($b[7])=&NR(1),&QWPw(1,$bp)); &FR($bp); + + ($c0,$c1,$c2)=&NR(3); + &mov("zero",$c2); + &mul($a[0],$b[0],$c0); + &muh($a[0],$b[0],$c1); + &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[1],$c0,$c1,$c2); + &mul_add_c($a[1],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[2],$c0,$c1,$c2); + &mul_add_c($a[1],$b[1],$c0,$c1,$c2); + &mul_add_c($a[2],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[3],$c0,$c1,$c2); + &mul_add_c($a[1],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[1],$c0,$c1,$c2); + &mul_add_c($a[3],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[4],$c0,$c1,$c2); + &mul_add_c($a[1],$b[3],$c0,$c1,$c2); + &mul_add_c($a[2],$b[2],$c0,$c1,$c2); + &mul_add_c($a[3],$b[1],$c0,$c1,$c2); + &mul_add_c($a[4],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[5],$c0,$c1,$c2); + &mul_add_c($a[1],$b[4],$c0,$c1,$c2); + &mul_add_c($a[2],$b[3],$c0,$c1,$c2); + &mul_add_c($a[3],$b[2],$c0,$c1,$c2); + &mul_add_c($a[4],$b[1],$c0,$c1,$c2); + &mul_add_c($a[5],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + 
&mul_add_c($a[0],$b[6],$c0,$c1,$c2); + &mul_add_c($a[1],$b[5],$c0,$c1,$c2); + &mul_add_c($a[2],$b[4],$c0,$c1,$c2); + &mul_add_c($a[3],$b[3],$c0,$c1,$c2); + &mul_add_c($a[4],$b[2],$c0,$c1,$c2); + &mul_add_c($a[5],$b[1],$c0,$c1,$c2); + &mul_add_c($a[6],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[7],$c0,$c1,$c2); &FR($a[0]); + &mul_add_c($a[1],$b[6],$c0,$c1,$c2); + &mul_add_c($a[2],$b[5],$c0,$c1,$c2); + &mul_add_c($a[3],$b[4],$c0,$c1,$c2); + &mul_add_c($a[4],$b[3],$c0,$c1,$c2); + &mul_add_c($a[5],$b[2],$c0,$c1,$c2); + &mul_add_c($a[6],$b[1],$c0,$c1,$c2); + &mul_add_c($a[7],$b[0],$c0,$c1,$c2); &FR($b[0]); + &st($c0,&QWPw(7,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[7],$c0,$c1,$c2); &FR($a[1]); + &mul_add_c($a[2],$b[6],$c0,$c1,$c2); + &mul_add_c($a[3],$b[5],$c0,$c1,$c2); + &mul_add_c($a[4],$b[4],$c0,$c1,$c2); + &mul_add_c($a[5],$b[3],$c0,$c1,$c2); + &mul_add_c($a[6],$b[2],$c0,$c1,$c2); + &mul_add_c($a[7],$b[1],$c0,$c1,$c2); &FR($b[1]); + &st($c0,&QWPw(8,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[2],$b[7],$c0,$c1,$c2); &FR($a[2]); + &mul_add_c($a[3],$b[6],$c0,$c1,$c2); + &mul_add_c($a[4],$b[5],$c0,$c1,$c2); + &mul_add_c($a[5],$b[4],$c0,$c1,$c2); + &mul_add_c($a[6],$b[3],$c0,$c1,$c2); + &mul_add_c($a[7],$b[2],$c0,$c1,$c2); &FR($b[2]); + &st($c0,&QWPw(9,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[3],$b[7],$c0,$c1,$c2); &FR($a[3]); + &mul_add_c($a[4],$b[6],$c0,$c1,$c2); + &mul_add_c($a[5],$b[5],$c0,$c1,$c2); + &mul_add_c($a[6],$b[4],$c0,$c1,$c2); + &mul_add_c($a[7],$b[3],$c0,$c1,$c2); &FR($b[3]); + &st($c0,&QWPw(10,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[4],$b[7],$c0,$c1,$c2); &FR($a[4]); + &mul_add_c($a[5],$b[6],$c0,$c1,$c2); + 
&mul_add_c($a[6],$b[5],$c0,$c1,$c2); + &mul_add_c($a[7],$b[4],$c0,$c1,$c2); &FR($b[4]); + &st($c0,&QWPw(11,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[5],$b[7],$c0,$c1,$c2); &FR($a[5]); + &mul_add_c($a[6],$b[6],$c0,$c1,$c2); + &mul_add_c($a[7],$b[5],$c0,$c1,$c2); &FR($b[5]); + &st($c0,&QWPw(12,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[6],$b[7],$c0,$c1,$c2); &FR($a[6]); + &mul_add_c($a[7],$b[6],$c0,$c1,$c2); &FR($b[6]); + &st($c0,&QWPw(13,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[7],$b[7],$c0,$c1,$c2); &FR($a[7],$b[7]); + &st($c0,&QWPw(14,$rp)); + &st($c1,&QWPw(15,$rp)); + + &FR($c0,$c1,$c2); + + &ld($reg_s0,&swtmp(0)); + &ld($reg_s1,&swtmp(1)); + &stack_pop(2); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/sqr.pl b/crypto/bn/asm/alpha/sqr.pl new file mode 100644 index 0000000000..a55b696906 --- /dev/null +++ b/crypto/bn/asm/alpha/sqr.pl @@ -0,0 +1,113 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sqr_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(3); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + 
($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &mul($a0,$a0,($l0)=&NR(1)); + &add($ap,$QWS,$ap); + &add($rp,2*$QWS,$rp); + &sub($count,1,$count); + &muh($a0,$a0,($h0)=&NR(1)); &FR($a0); + &st($l0,&QWPw(-2,$rp)); &FR($l0); + &st($h0,&QWPw(-1,$rp)); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/sqr_c4.pl b/crypto/bn/asm/alpha/sqr_c4.pl new file mode 100644 index 0000000000..bf33f5b503 --- /dev/null +++ b/crypto/bn/asm/alpha/sqr_c4.pl @@ -0,0 +1,109 @@ +#!/usr/local/bin/perl +# alpha assember + +sub sqr_add_c + { + local($a,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$a,($l1)=&NR(1)); + &muh($a,$a,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &add($c1,$h1,$c1); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + 
&cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c1,$t1,$c1); &FR($t1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub sqr_add_c2 + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &cmplt($l1,"zero",($lc1)=&NR(1)); + &cmplt($h1,"zero",($hc1)=&NR(1)); + &add($l1,$l1,$l1); + &add($h1,$h1,$h1); + &add($h1,$lc1,$h1); &FR($lc1); + &add($c2,$hc1,$c2); &FR($hc1); + + &add($c0,$l1,$c0); + &add($c1,$h1,$c1); + &cmpult($c0,$l1,($lc1)=&NR(1)); &FR($l1); + &cmpult($c1,$h1,($hc1)=&NR(1)); &FR($h1); + + &add($c1,$lc1,$c1); &FR($lc1); + &add($c2,$hc1,$c2); &FR($hc1); + } + + +sub bn_sqr_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(2); + + $rp=&wparam(0); + $ap=&wparam(1); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap); + + ($c0,$c1,$c2)=&NR(3); + + &mov("zero",$c2); + &mul($a[0],$a[0],$c0); + &muh($a[0],$a[0],$c1); + &st($c0,&QWPw(0,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[0],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[1],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[2],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[3],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); + &st($c1,&QWPw(7,$rp)); + + &function_end($name); + + &fin_pool; + } + +1; diff --git 
a/crypto/bn/asm/alpha/sqr_c8.pl b/crypto/bn/asm/alpha/sqr_c8.pl new file mode 100644 index 0000000000..b4afe085f1 --- /dev/null +++ b/crypto/bn/asm/alpha/sqr_c8.pl @@ -0,0 +1,132 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sqr_comba8 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(2); + + $rp=&wparam(0); + $ap=&wparam(1); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &ld(($a[4])=&NR(1),&QWPw(4,$ap)); + &ld(($a[5])=&NR(1),&QWPw(5,$ap)); + &ld(($a[6])=&NR(1),&QWPw(6,$ap)); + &ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap); + + ($c0,$c1,$c2)=&NR(3); + + &mov("zero",$c2); + &mul($a[0],$a[0],$c0); + &muh($a[0],$a[0],$c1); + &st($c0,&QWPw(0,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[1],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[1],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[2],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[3],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[4],$a[3],$c0,$c1,$c2); + 
&sqr_add_c2($a[5],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(7,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[4],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(8,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[5],$a[4],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[2],$c0,$c1,$c2); + &st($c0,&QWPw(9,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[5],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[4],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[3],$c0,$c1,$c2); + &st($c0,&QWPw(10,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[6],$a[5],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[4],$c0,$c1,$c2); + &st($c0,&QWPw(11,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[6],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[5],$c0,$c1,$c2); + &st($c0,&QWPw(12,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[7],$a[6],$c0,$c1,$c2); + &st($c0,&QWPw(13,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[7],$c0,$c1,$c2); + &st($c0,&QWPw(14,$rp)); + &st($c1,&QWPw(15,$rp)); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/sub.pl b/crypto/bn/asm/alpha/sub.pl new file mode 100644 index 0000000000..d998da5c21 --- /dev/null +++ b/crypto/bn/asm/alpha/sub.pl @@ -0,0 +1,108 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sub_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &blt($count,&label("finish")); + + ($a0,$b0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + 
+########################################################## + &set_label("loop"); + + ($a1,$tmp,$b1,$a2,$b2,$a3,$b3,$o0)=&NR(8); + &ld($a1,&QWPw(1,$ap)); + &cmpult($a0,$b0,$tmp); # will we borrow? + &ld($b1,&QWPw(1,$bp)); + &sub($a0,$b0,$a0); # do the subtract + &ld($a2,&QWPw(2,$ap)); + &cmpult($a0,$cc,$b0); # will we borrow? + &ld($b2,&QWPw(2,$bp)); + &sub($a0,$cc,$o0); # will we borrow? + &ld($a3,&QWPw(3,$ap)); + &add($b0,$tmp,$cc); ($t1,$o1)=&NR(2); &FR($tmp); + + &cmpult($a1,$b1,$t1); # will we borrow? + &sub($a1,$b1,$a1); # do the subtract + &ld($b3,&QWPw(3,$bp)); + &cmpult($a1,$cc,$b1); # will we borrow? + &sub($a1,$cc,$o1); # will we borrow? + &add($b1,$t1,$cc); ($tmp,$o2)=&NR(2); &FR($t1,$a1,$b1); + + &cmpult($a2,$b2,$tmp); # will we borrow? + &sub($a2,$b2,$a2); # do the subtract + &st($o0,&QWPw(0,$rp)); &FR($o0); # save + &cmpult($a2,$cc,$b2); # will we borrow? + &sub($a2,$cc,$o2); # will we borrow? + &add($b2,$tmp,$cc); ($t3,$o3)=&NR(2); &FR($tmp,$a2,$b2); + + &cmpult($a3,$b3,$t3); # will we borrow? + &sub($a3,$b3,$a3); # do the subtract + &st($o1,&QWPw(1,$rp)); &FR($o1); + &cmpult($a3,$cc,$b3); # will we borrow? + &sub($a3,$cc,$o3); # will we borrow? + &add($b3,$t3,$cc); &FR($t3,$a3,$b3); + + &st($o2,&QWPw(2,$rp)); &FR($o2); + &sub($count,4,$count); # count-=4 + &st($o3,&QWPw(3,$rp)); &FR($o3); + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + &cmpult($a0,$b0,$tmp); # will we borrow? + &sub($a0,$b0,$a0); # do the subtract + &cmpult($a0,$cc,$b0); # will we borrow? + &sub($a0,$cc,$a0); # will we borrow? 
+ &st($a0,&QWPw(0,$rp)); # save + &add($b0,$tmp,$cc); # add the borrows + + &add($ap,$QWS,$ap); + &add($bp,$QWS,$bp); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &FR($a0,$b0); + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/bn-586.pl b/crypto/bn/asm/bn-586.pl index 128f0f29d6..7a03c67b5b 100644 --- a/crypto/bn/asm/bn-586.pl +++ b/crypto/bn/asm/bn-586.pl @@ -1,7 +1,4 @@ #!/usr/local/bin/perl -# - -#!/usr/local/bin/perl push(@INC,"perlasm","../../perlasm"); require "x86asm.pl"; @@ -11,8 +8,9 @@ require "x86asm.pl"; &bn_mul_add_words("bn_mul_add_words"); &bn_mul_words("bn_mul_words"); &bn_sqr_words("bn_sqr_words"); -&bn_div64("bn_div64"); +&bn_div_words("bn_div_words"); &bn_add_words("bn_add_words"); +&bn_sub_words("bn_sub_words"); &asm_finish(); @@ -228,7 +226,7 @@ sub bn_sqr_words &function_end($name); } -sub bn_div64 +sub bn_div_words { local($name)=@_; @@ -307,7 +305,79 @@ sub bn_add_words } &set_label("aw_end",0); - &mov("eax",$c); +# &mov("eax",$c); # $c is "eax" + + &function_end($name); + } + +sub bn_sub_words + { + local($name)=@_; + + &function_begin($name,""); + + &comment(""); + $a="esi"; + $b="edi"; + $c="eax"; + $r="ebx"; + $tmp1="ecx"; + $tmp2="edx"; + $num="ebp"; + + &mov($r,&wparam(0)); # get r + &mov($a,&wparam(1)); # get a + &mov($b,&wparam(2)); # get b + &mov($num,&wparam(3)); # get num + &xor($c,$c); # clear carry + &and($num,0xfffffff8); # num / 8 + + &jz(&label("aw_finish")); + + &set_label("aw_loop",0); + for ($i=0; $i<8; $i++) + { + &comment("Round $i"); + + &mov($tmp1,&DWP($i*4,$a,"",0)); # *a + &mov($tmp2,&DWP($i*4,$b,"",0)); # *b + &sub($tmp1,$c); + &mov($c,0); + &adc($c,$c); + &sub($tmp1,$tmp2); + &adc($c,0); + &mov(&DWP($i*4,$r,"",0),$tmp1); # *r + } + + &comment(""); + &add($a,32); 
+ &add($b,32); + &add($r,32); + &sub($num,8); + &jnz(&label("aw_loop")); + + &set_label("aw_finish",0); + &mov($num,&wparam(3)); # get num + &and($num,7); + &jz(&label("aw_end")); + + for ($i=0; $i<7; $i++) + { + &comment("Tail Round $i"); + &mov($tmp1,&DWP($i*4,$a,"",0)); # *a + &mov($tmp2,&DWP($i*4,$b,"",0));# *b + &sub($tmp1,$c); + &mov($c,0); + &adc($c,$c); + &sub($tmp1,$tmp2); + &adc($c,0); + &dec($num) if ($i != 6); + &mov(&DWP($i*4,$r,"",0),$tmp1); # *a + &jz(&label("aw_end")) if ($i != 6); + } + &set_label("aw_end",0); + +# &mov("eax",$c); # $c is "eax" &function_end($name); } diff --git a/crypto/bn/asm/bn-alpha.pl b/crypto/bn/asm/bn-alpha.pl new file mode 100644 index 0000000000..302edf2376 --- /dev/null +++ b/crypto/bn/asm/bn-alpha.pl @@ -0,0 +1,571 @@ +#!/usr/local/bin/perl +# I have this in perl so I can use more usefull register names and then convert +# them into alpha registers. +# + +$d=&data(); +$d =~ s/CC/0/g; +$d =~ s/R1/1/g; +$d =~ s/R2/2/g; +$d =~ s/R3/3/g; +$d =~ s/R4/4/g; +$d =~ s/L1/5/g; +$d =~ s/L2/6/g; +$d =~ s/L3/7/g; +$d =~ s/L4/8/g; +$d =~ s/O1/22/g; +$d =~ s/O2/23/g; +$d =~ s/O3/24/g; +$d =~ s/O4/25/g; +$d =~ s/A1/20/g; +$d =~ s/A2/21/g; +$d =~ s/A3/27/g; +$d =~ s/A4/28/g; +if (0){ +} + +print $d; + +sub data + { + local($data)=<<'EOF'; + + # DEC Alpha assember + # The bn_div_words is actually gcc output but the other parts are hand done. + # Thanks to tzeruch@ceddec.com for sending me the gcc output for + # bn_div_words. + # I've gone back and re-done most of routines. + # The key thing to remeber for the 164 CPU is that while a + # multiply operation takes 8 cycles, another one can only be issued + # after 4 cycles have elapsed. I've done modification to help + # improve this. Also, normally, a ld instruction will not be available + # for about 3 cycles. 
+ .file 1 "bn_asm.c" + .set noat +gcc2_compiled.: +__gnu_compiled_c: + .text + .align 3 + .globl bn_mul_add_words + .ent bn_mul_add_words +bn_mul_add_words: +bn_mul_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + .align 5 + subq $18,4,$18 + bis $31,$31,$CC + blt $18,$43 # if we are -1, -2, -3 or -4 goto tail code + ldq $A1,0($17) # 1 1 + ldq $R1,0($16) # 1 1 + .align 3 +$42: + mulq $A1,$19,$L1 # 1 2 1 ###### + ldq $A2,8($17) # 2 1 + ldq $R2,8($16) # 2 1 + umulh $A1,$19,$A1 # 1 2 ###### + ldq $A3,16($17) # 3 1 + ldq $R3,16($16) # 3 1 + mulq $A2,$19,$L2 # 2 2 1 ###### + ldq $A4,24($17) # 4 1 + addq $R1,$L1,$R1 # 1 2 2 + ldq $R4,24($16) # 4 1 + umulh $A2,$19,$A2 # 2 2 ###### + cmpult $R1,$L1,$O1 # 1 2 3 1 + addq $A1,$O1,$A1 # 1 3 1 + addq $R1,$CC,$R1 # 1 2 3 1 + mulq $A3,$19,$L3 # 3 2 1 ###### + cmpult $R1,$CC,$CC # 1 2 3 2 + addq $R2,$L2,$R2 # 2 2 2 + addq $A1,$CC,$CC # 1 3 2 + cmpult $R2,$L2,$O2 # 2 2 3 1 + addq $A2,$O2,$A2 # 2 3 1 + umulh $A3,$19,$A3 # 3 2 ###### + addq $R2,$CC,$R2 # 2 2 3 1 + cmpult $R2,$CC,$CC # 2 2 3 2 + subq $18,4,$18 + mulq $A4,$19,$L4 # 4 2 1 ###### + addq $A2,$CC,$CC # 2 3 2 + addq $R3,$L3,$R3 # 3 2 2 + addq $16,32,$16 + cmpult $R3,$L3,$O3 # 3 2 3 1 + stq $R1,-32($16) # 1 2 4 + umulh $A4,$19,$A4 # 4 2 ###### + addq $A3,$O3,$A3 # 3 3 1 + addq $R3,$CC,$R3 # 3 2 3 1 + stq $R2,-24($16) # 2 2 4 + cmpult $R3,$CC,$CC # 3 2 3 2 + stq $R3,-16($16) # 3 2 4 + addq $R4,$L4,$R4 # 4 2 2 + addq $A3,$CC,$CC # 3 3 2 + cmpult $R4,$L4,$O4 # 4 2 3 1 + addq $17,32,$17 + addq $A4,$O4,$A4 # 4 3 1 + addq $R4,$CC,$R4 # 4 2 3 1 + cmpult $R4,$CC,$CC # 4 2 3 2 + stq $R4,-8($16) # 4 2 4 + addq $A4,$CC,$CC # 4 3 2 + blt $18,$43 + + ldq $A1,0($17) # 1 1 + ldq $R1,0($16) # 1 1 + + br $42 + + .align 4 +$45: + ldq $A1,0($17) # 4 1 + ldq $R1,0($16) # 4 1 + mulq $A1,$19,$L1 # 4 2 1 + subq $18,1,$18 + addq $16,8,$16 + addq $17,8,$17 + umulh $A1,$19,$A1 # 4 2 + addq $R1,$L1,$R1 # 4 2 2 + cmpult $R1,$L1,$O1 # 4 2 3 1 + addq $A1,$O1,$A1 # 4 3 1 + addq $R1,$CC,$R1 # 4 2 3 1 + 
cmpult $R1,$CC,$CC # 4 2 3 2 + addq $A1,$CC,$CC # 4 3 2 + stq $R1,-8($16) # 4 2 4 + bgt $18,$45 + ret $31,($26),1 # else exit + + .align 4 +$43: + addq $18,4,$18 + bgt $18,$45 # goto tail code + ret $31,($26),1 # else exit + + .end bn_mul_add_words + .align 3 + .globl bn_mul_words + .ent bn_mul_words +bn_mul_words: +bn_mul_words..ng: + .frame $30,0,$26,0 + .prologue 0 + .align 5 + subq $18,4,$18 + bis $31,$31,$CC + blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code + ldq $A1,0($17) # 1 1 + .align 3 +$142: + + mulq $A1,$19,$L1 # 1 2 1 ##### + ldq $A2,8($17) # 2 1 + ldq $A3,16($17) # 3 1 + umulh $A1,$19,$A1 # 1 2 ##### + ldq $A4,24($17) # 4 1 + mulq $A2,$19,$L2 # 2 2 1 ##### + addq $L1,$CC,$L1 # 1 2 3 1 + subq $18,4,$18 + cmpult $L1,$CC,$CC # 1 2 3 2 + umulh $A2,$19,$A2 # 2 2 ##### + addq $A1,$CC,$CC # 1 3 2 + addq $17,32,$17 + addq $L2,$CC,$L2 # 2 2 3 1 + mulq $A3,$19,$L3 # 3 2 1 ##### + cmpult $L2,$CC,$CC # 2 2 3 2 + addq $A2,$CC,$CC # 2 3 2 + addq $16,32,$16 + umulh $A3,$19,$A3 # 3 2 ##### + stq $L1,-32($16) # 1 2 4 + mulq $A4,$19,$L4 # 4 2 1 ##### + addq $L3,$CC,$L3 # 3 2 3 1 + stq $L2,-24($16) # 2 2 4 + cmpult $L3,$CC,$CC # 3 2 3 2 + umulh $A4,$19,$A4 # 4 2 ##### + addq $A3,$CC,$CC # 3 3 2 + stq $L3,-16($16) # 3 2 4 + addq $L4,$CC,$L4 # 4 2 3 1 + cmpult $L4,$CC,$CC # 4 2 3 2 + + addq $A4,$CC,$CC # 4 3 2 + + stq $L4,-8($16) # 4 2 4 + + blt $18,$143 + + ldq $A1,0($17) # 1 1 + + br $142 + + .align 4 +$145: + ldq $A1,0($17) # 4 1 + mulq $A1,$19,$L1 # 4 2 1 + subq $18,1,$18 + umulh $A1,$19,$A1 # 4 2 + addq $L1,$CC,$L1 # 4 2 3 1 + addq $16,8,$16 + cmpult $L1,$CC,$CC # 4 2 3 2 + addq $17,8,$17 + addq $A1,$CC,$CC # 4 3 2 + stq $L1,-8($16) # 4 2 4 + + bgt $18,$145 + ret $31,($26),1 # else exit + + .align 4 +$143: + addq $18,4,$18 + bgt $18,$145 # goto tail code + ret $31,($26),1 # else exit + + .end bn_mul_words + .align 3 + .globl bn_sqr_words + .ent bn_sqr_words +bn_sqr_words: +bn_sqr_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18,4,$18 + blt $18,$543 
# if we are -1, -2, -3 or -4 goto tail code + ldq $A1,0($17) # 1 1 + .align 3 +$542: + mulq $A1,$A1,$L1 ###### + ldq $A2,8($17) # 1 1 + subq $18,4 + umulh $A1,$A1,$R1 ###### + ldq $A3,16($17) # 1 1 + mulq $A2,$A2,$L2 ###### + ldq $A4,24($17) # 1 1 + stq $L1,0($16) # r[0] + umulh $A2,$A2,$R2 ###### + stq $R1,8($16) # r[1] + mulq $A3,$A3,$L3 ###### + stq $L2,16($16) # r[0] + umulh $A3,$A3,$R3 ###### + stq $R2,24($16) # r[1] + mulq $A4,$A4,$L4 ###### + stq $L3,32($16) # r[0] + umulh $A4,$A4,$R4 ###### + stq $R3,40($16) # r[1] + + addq $16,64,$16 + addq $17,32,$17 + stq $L4,-16($16) # r[0] + stq $R4,-8($16) # r[1] + + blt $18,$543 + ldq $A1,0($17) # 1 1 + br $542 + +$442: + ldq $A1,0($17) # a[0] + mulq $A1,$A1,$L1 # a[0]*w low part r2 + addq $16,16,$16 + addq $17,8,$17 + subq $18,1,$18 + umulh $A1,$A1,$R1 # a[0]*w high part r3 + stq $L1,-16($16) # r[0] + stq $R1,-8($16) # r[1] + + bgt $18,$442 + ret $31,($26),1 # else exit + + .align 4 +$543: + addq $18,4,$18 + bgt $18,$442 # goto tail code + ret $31,($26),1 # else exit + .end bn_sqr_words + + .align 3 + .globl bn_add_words + .ent bn_add_words +bn_add_words: +bn_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19,4,$19 + bis $31,$31,$CC # carry = 0 + blt $19,$900 + ldq $L1,0($17) # a[0] + ldq $R1,0($18) # b[1] + .align 3 +$901: + addq $R1,$L1,$R1 # r=a+b; + ldq $L2,8($17) # a[1] + cmpult $R1,$L1,$O1 # did we overflow? + ldq $R2,8($18) # b[1] + addq $R1,$CC,$R1 # c+= overflow + ldq $L3,16($17) # a[2] + cmpult $R1,$CC,$CC # overflow? + ldq $R3,16($18) # b[2] + addq $CC,$O1,$CC + ldq $L4,24($17) # a[3] + addq $R2,$L2,$R2 # r=a+b; + ldq $R4,24($18) # b[3] + cmpult $R2,$L2,$O2 # did we overflow? + addq $R3,$L3,$R3 # r=a+b; + addq $R2,$CC,$R2 # c+= overflow + cmpult $R3,$L3,$O3 # did we overflow? + cmpult $R2,$CC,$CC # overflow? + addq $R4,$L4,$R4 # r=a+b; + addq $CC,$O2,$CC + cmpult $R4,$L4,$O4 # did we overflow? + addq $R3,$CC,$R3 # c+= overflow + stq $R1,0($16) # r[0]=c + cmpult $R3,$CC,$CC # overflow? 
+ stq $R2,8($16) # r[1]=c + addq $CC,$O3,$CC + stq $R3,16($16) # r[2]=c + addq $R4,$CC,$R4 # c+= overflow + subq $19,4,$19 # loop-- + cmpult $R4,$CC,$CC # overflow? + addq $17,32,$17 # a++ + addq $CC,$O4,$CC + stq $R4,24($16) # r[3]=c + addq $18,32,$18 # b++ + addq $16,32,$16 # r++ + + blt $19,$900 + ldq $L1,0($17) # a[0] + ldq $R1,0($18) # b[1] + br $901 + .align 4 +$945: + ldq $L1,0($17) # a[0] + ldq $R1,0($18) # b[1] + addq $R1,$L1,$R1 # r=a+b; + subq $19,1,$19 # loop-- + addq $R1,$CC,$R1 # c+= overflow + addq $17,8,$17 # a++ + cmpult $R1,$L1,$O1 # did we overflow? + cmpult $R1,$CC,$CC # overflow? + addq $18,8,$18 # b++ + stq $R1,0($16) # r[0]=c + addq $CC,$O1,$CC + addq $16,8,$16 # r++ + + bgt $19,$945 + ret $31,($26),1 # else exit + +$900: + addq $19,4,$19 + bgt $19,$945 # goto tail code + ret $31,($26),1 # else exit + .end bn_add_words + + .align 3 + .globl bn_sub_words + .ent bn_sub_words +bn_sub_words: +bn_sub_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19,4,$19 + bis $31,$31,$CC # carry = 0 + br $800 + blt $19,$800 + ldq $L1,0($17) # a[0] + ldq $R1,0($18) # b[1] + .align 3 +$801: + addq $R1,$L1,$R1 # r=a+b; + ldq $L2,8($17) # a[1] + cmpult $R1,$L1,$O1 # did we overflow? + ldq $R2,8($18) # b[1] + addq $R1,$CC,$R1 # c+= overflow + ldq $L3,16($17) # a[2] + cmpult $R1,$CC,$CC # overflow? + ldq $R3,16($18) # b[2] + addq $CC,$O1,$CC + ldq $L4,24($17) # a[3] + addq $R2,$L2,$R2 # r=a+b; + ldq $R4,24($18) # b[3] + cmpult $R2,$L2,$O2 # did we overflow? + addq $R3,$L3,$R3 # r=a+b; + addq $R2,$CC,$R2 # c+= overflow + cmpult $R3,$L3,$O3 # did we overflow? + cmpult $R2,$CC,$CC # overflow? + addq $R4,$L4,$R4 # r=a+b; + addq $CC,$O2,$CC + cmpult $R4,$L4,$O4 # did we overflow? + addq $R3,$CC,$R3 # c+= overflow + stq $R1,0($16) # r[0]=c + cmpult $R3,$CC,$CC # overflow? + stq $R2,8($16) # r[1]=c + addq $CC,$O3,$CC + stq $R3,16($16) # r[2]=c + addq $R4,$CC,$R4 # c+= overflow + subq $19,4,$19 # loop-- + cmpult $R4,$CC,$CC # overflow? 
+ addq $17,32,$17 # a++ + addq $CC,$O4,$CC + stq $R4,24($16) # r[3]=c + addq $18,32,$18 # b++ + addq $16,32,$16 # r++ + + blt $19,$800 + ldq $L1,0($17) # a[0] + ldq $R1,0($18) # b[1] + br $801 + .align 4 +$845: + ldq $L1,0($17) # a[0] + ldq $R1,0($18) # b[1] + cmpult $L1,$R1,$O1 # will we borrow? + subq $L1,$R1,$R1 # r=a-b; + subq $19,1,$19 # loop-- + cmpult $R1,$CC,$O2 # will we borrow? + subq $R1,$CC,$R1 # c+= overflow + addq $17,8,$17 # a++ + addq $18,8,$18 # b++ + stq $R1,0($16) # r[0]=c + addq $O2,$O1,$CC + addq $16,8,$16 # r++ + + bgt $19,$845 + ret $31,($26),1 # else exit + +$800: + addq $19,4,$19 + bgt $19,$845 # goto tail code + ret $31,($26),1 # else exit + .end bn_sub_words + + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the lables. + # +.text + .align 3 + .globl bn_div_words + .ent bn_div_words +bn_div_words: + ldgp $29,0($27) +bn_div_words..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$119 + lda $0,-1 + br $31,$136 + .align 4 +$119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$126 + zapnot $7,15,$27 + br $31,$127 + .align 4 +$126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 
+$127: + srl $10,32,$4 + .align 5 +$128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$129 + subq $27,1,$27 + br $31,$128 + .align 4 +$129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$134 + addq $9,$11,$9 + subq $27,1,$27 +$134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$123 + .align 4 +$124: + bis $13,$27,$0 +$136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div_words +EOF + return($data); + } + diff --git a/crypto/bn/asm/bn-win32.asm b/crypto/bn/asm/bn-win32.asm index 017ea462b0..871bd88d77 100644 --- a/crypto/bn/asm/bn-win32.asm +++ b/crypto/bn/asm/bn-win32.asm @@ -485,9 +485,9 @@ $L010sw_end: _bn_sqr_words ENDP _TEXT ENDS _TEXT SEGMENT -PUBLIC _bn_div64 +PUBLIC _bn_div_words -_bn_div64 PROC NEAR +_bn_div_words PROC NEAR push ebp push ebx push esi @@ -501,7 +501,7 @@ _bn_div64 PROC NEAR pop ebx pop ebp ret -_bn_div64 ENDP +_bn_div_words ENDP _TEXT ENDS _TEXT SEGMENT PUBLIC _bn_add_words @@ -678,7 +678,6 @@ $L011aw_finish: adc eax, 0 mov DWORD PTR 24[ebx],ecx $L013aw_end: - mov eax, eax pop edi pop esi pop ebx @@ -686,4 +685,1438 @@ $L013aw_end: ret _bn_add_words ENDP _TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_sub_words + +_bn_sub_words PROC NEAR + push ebp + push ebx + push esi + push edi + ; + mov ebx, DWORD PTR 20[esp] + mov esi, DWORD PTR 24[esp] + mov edi, DWORD PTR 28[esp] + mov ebp, DWORD PTR 32[esp] + xor eax, eax + and ebp, 4294967288 + jz $L014aw_finish +L015aw_loop: + ; Round 0 + mov ecx, DWORD PTR [esi] + mov edx, DWORD PTR [edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR [ebx],ecx + ; Round 
1 + mov ecx, DWORD PTR 4[esi] + mov edx, DWORD PTR 4[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 4[ebx],ecx + ; Round 2 + mov ecx, DWORD PTR 8[esi] + mov edx, DWORD PTR 8[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 8[ebx],ecx + ; Round 3 + mov ecx, DWORD PTR 12[esi] + mov edx, DWORD PTR 12[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 12[ebx],ecx + ; Round 4 + mov ecx, DWORD PTR 16[esi] + mov edx, DWORD PTR 16[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 16[ebx],ecx + ; Round 5 + mov ecx, DWORD PTR 20[esi] + mov edx, DWORD PTR 20[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 20[ebx],ecx + ; Round 6 + mov ecx, DWORD PTR 24[esi] + mov edx, DWORD PTR 24[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 24[ebx],ecx + ; Round 7 + mov ecx, DWORD PTR 28[esi] + mov edx, DWORD PTR 28[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 28[ebx],ecx + ; + add esi, 32 + add edi, 32 + add ebx, 32 + sub ebp, 8 + jnz L015aw_loop +$L014aw_finish: + mov ebp, DWORD PTR 32[esp] + and ebp, 7 + jz $L016aw_end + ; Tail Round 0 + mov ecx, DWORD PTR [esi] + mov edx, DWORD PTR [edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR [ebx],ecx + jz $L016aw_end + ; Tail Round 1 + mov ecx, DWORD PTR 4[esi] + mov edx, DWORD PTR 4[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 4[ebx],ecx + jz $L016aw_end + ; Tail Round 2 + mov ecx, DWORD PTR 8[esi] + mov edx, DWORD PTR 8[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 8[ebx],ecx + jz $L016aw_end + ; Tail Round 3 + mov ecx, DWORD PTR 12[esi] + mov edx, DWORD PTR 12[edi] + sub ecx, 
eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 12[ebx],ecx + jz $L016aw_end + ; Tail Round 4 + mov ecx, DWORD PTR 16[esi] + mov edx, DWORD PTR 16[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 16[ebx],ecx + jz $L016aw_end + ; Tail Round 5 + mov ecx, DWORD PTR 20[esi] + mov edx, DWORD PTR 20[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 20[ebx],ecx + jz $L016aw_end + ; Tail Round 6 + mov ecx, DWORD PTR 24[esi] + mov edx, DWORD PTR 24[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 24[ebx],ecx +$L016aw_end: + pop edi + pop esi + pop ebx + pop ebp + ret +_bn_sub_words ENDP +_TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_mul_comba8 + +_bn_mul_comba8 PROC NEAR + push esi + mov esi, DWORD PTR 12[esp] + push edi + mov edi, DWORD PTR 20[esp] + push ebp + push ebx + xor ebx, ebx + mov eax, DWORD PTR [esi] + xor ecx, ecx + mov edx, DWORD PTR [edi] + ; ################## Calculate word 0 + xor ebp, ebp + ; mul a[0]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR [edi] + adc ebp, 0 + mov DWORD PTR [eax],ebx + mov eax, DWORD PTR 4[esi] + ; saved r[0] + ; ################## Calculate word 1 + xor ebx, ebx + ; mul a[1]*b[0] + mul edx + add ecx, eax + mov eax, DWORD PTR [esi] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + ; mul a[0]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR [edi] + adc ebx, 0 + mov DWORD PTR 4[eax],ecx + mov eax, DWORD PTR 8[esi] + ; saved r[1] + ; ################## Calculate word 2 + xor ecx, ecx + ; mul a[2]*b[0] + mul edx + add ebp, eax + mov eax, DWORD PTR 4[esi] + adc ebx, edx + mov edx, DWORD PTR 4[edi] + adc ecx, 0 + ; mul a[1]*b[1] + mul edx + add ebp, eax + mov eax, DWORD PTR [esi] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + ; mul a[0]*b[2] + mul edx 
+ add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR [edi] + adc ecx, 0 + mov DWORD PTR 8[eax],ebp + mov eax, DWORD PTR 12[esi] + ; saved r[2] + ; ################## Calculate word 3 + xor ebp, ebp + ; mul a[3]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 8[esi] + adc ecx, edx + mov edx, DWORD PTR 4[edi] + adc ebp, 0 + ; mul a[2]*b[1] + mul edx + add ebx, eax + mov eax, DWORD PTR 4[esi] + adc ecx, edx + mov edx, DWORD PTR 8[edi] + adc ebp, 0 + ; mul a[1]*b[2] + mul edx + add ebx, eax + mov eax, DWORD PTR [esi] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + ; mul a[0]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR [edi] + adc ebp, 0 + mov DWORD PTR 12[eax],ebx + mov eax, DWORD PTR 16[esi] + ; saved r[3] + ; ################## Calculate word 4 + xor ebx, ebx + ; mul a[4]*b[0] + mul edx + add ecx, eax + mov eax, DWORD PTR 12[esi] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + ; mul a[3]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 8[esi] + adc ebp, edx + mov edx, DWORD PTR 8[edi] + adc ebx, 0 + ; mul a[2]*b[2] + mul edx + add ecx, eax + mov eax, DWORD PTR 4[esi] + adc ebp, edx + mov edx, DWORD PTR 12[edi] + adc ebx, 0 + ; mul a[1]*b[3] + mul edx + add ecx, eax + mov eax, DWORD PTR [esi] + adc ebp, edx + mov edx, DWORD PTR 16[edi] + adc ebx, 0 + ; mul a[0]*b[4] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR [edi] + adc ebx, 0 + mov DWORD PTR 16[eax],ecx + mov eax, DWORD PTR 20[esi] + ; saved r[4] + ; ################## Calculate word 5 + xor ecx, ecx + ; mul a[5]*b[0] + mul edx + add ebp, eax + mov eax, DWORD PTR 16[esi] + adc ebx, edx + mov edx, DWORD PTR 4[edi] + adc ecx, 0 + ; mul a[4]*b[1] + mul edx + add ebp, eax + mov eax, DWORD PTR 12[esi] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + ; mul a[3]*b[2] + mul edx + add ebp, eax + mov eax, DWORD PTR 8[esi] + adc ebx, edx + mov edx, DWORD PTR 12[edi] + 
adc ecx, 0 + ; mul a[2]*b[3] + mul edx + add ebp, eax + mov eax, DWORD PTR 4[esi] + adc ebx, edx + mov edx, DWORD PTR 16[edi] + adc ecx, 0 + ; mul a[1]*b[4] + mul edx + add ebp, eax + mov eax, DWORD PTR [esi] + adc ebx, edx + mov edx, DWORD PTR 20[edi] + adc ecx, 0 + ; mul a[0]*b[5] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR [edi] + adc ecx, 0 + mov DWORD PTR 20[eax],ebp + mov eax, DWORD PTR 24[esi] + ; saved r[5] + ; ################## Calculate word 6 + xor ebp, ebp + ; mul a[6]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esi] + adc ecx, edx + mov edx, DWORD PTR 4[edi] + adc ebp, 0 + ; mul a[5]*b[1] + mul edx + add ebx, eax + mov eax, DWORD PTR 16[esi] + adc ecx, edx + mov edx, DWORD PTR 8[edi] + adc ebp, 0 + ; mul a[4]*b[2] + mul edx + add ebx, eax + mov eax, DWORD PTR 12[esi] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + ; mul a[3]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 8[esi] + adc ecx, edx + mov edx, DWORD PTR 16[edi] + adc ebp, 0 + ; mul a[2]*b[4] + mul edx + add ebx, eax + mov eax, DWORD PTR 4[esi] + adc ecx, edx + mov edx, DWORD PTR 20[edi] + adc ebp, 0 + ; mul a[1]*b[5] + mul edx + add ebx, eax + mov eax, DWORD PTR [esi] + adc ecx, edx + mov edx, DWORD PTR 24[edi] + adc ebp, 0 + ; mul a[0]*b[6] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR [edi] + adc ebp, 0 + mov DWORD PTR 24[eax],ebx + mov eax, DWORD PTR 28[esi] + ; saved r[6] + ; ################## Calculate word 7 + xor ebx, ebx + ; mul a[7]*b[0] + mul edx + add ecx, eax + mov eax, DWORD PTR 24[esi] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + ; mul a[6]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esi] + adc ebp, edx + mov edx, DWORD PTR 8[edi] + adc ebx, 0 + ; mul a[5]*b[2] + mul edx + add ecx, eax + mov eax, DWORD PTR 16[esi] + adc ebp, edx + mov edx, DWORD PTR 12[edi] + adc ebx, 0 + ; mul a[4]*b[3] + mul edx + add ecx, eax + mov eax, DWORD PTR 12[esi] 
+ adc ebp, edx + mov edx, DWORD PTR 16[edi] + adc ebx, 0 + ; mul a[3]*b[4] + mul edx + add ecx, eax + mov eax, DWORD PTR 8[esi] + adc ebp, edx + mov edx, DWORD PTR 20[edi] + adc ebx, 0 + ; mul a[2]*b[5] + mul edx + add ecx, eax + mov eax, DWORD PTR 4[esi] + adc ebp, edx + mov edx, DWORD PTR 24[edi] + adc ebx, 0 + ; mul a[1]*b[6] + mul edx + add ecx, eax + mov eax, DWORD PTR [esi] + adc ebp, edx + mov edx, DWORD PTR 28[edi] + adc ebx, 0 + ; mul a[0]*b[7] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + mov DWORD PTR 28[eax],ecx + mov eax, DWORD PTR 28[esi] + ; saved r[7] + ; ################## Calculate word 8 + xor ecx, ecx + ; mul a[7]*b[1] + mul edx + add ebp, eax + mov eax, DWORD PTR 24[esi] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + ; mul a[6]*b[2] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esi] + adc ebx, edx + mov edx, DWORD PTR 12[edi] + adc ecx, 0 + ; mul a[5]*b[3] + mul edx + add ebp, eax + mov eax, DWORD PTR 16[esi] + adc ebx, edx + mov edx, DWORD PTR 16[edi] + adc ecx, 0 + ; mul a[4]*b[4] + mul edx + add ebp, eax + mov eax, DWORD PTR 12[esi] + adc ebx, edx + mov edx, DWORD PTR 20[edi] + adc ecx, 0 + ; mul a[3]*b[5] + mul edx + add ebp, eax + mov eax, DWORD PTR 8[esi] + adc ebx, edx + mov edx, DWORD PTR 24[edi] + adc ecx, 0 + ; mul a[2]*b[6] + mul edx + add ebp, eax + mov eax, DWORD PTR 4[esi] + adc ebx, edx + mov edx, DWORD PTR 28[edi] + adc ecx, 0 + ; mul a[1]*b[7] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + mov DWORD PTR 32[eax],ebp + mov eax, DWORD PTR 28[esi] + ; saved r[8] + ; ################## Calculate word 9 + xor ebp, ebp + ; mul a[7]*b[2] + mul edx + add ebx, eax + mov eax, DWORD PTR 24[esi] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + ; mul a[6]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esi] + adc ecx, edx + mov edx, DWORD PTR 16[edi] + adc ebp, 0 + ; mul a[5]*b[4] + 
mul edx + add ebx, eax + mov eax, DWORD PTR 16[esi] + adc ecx, edx + mov edx, DWORD PTR 20[edi] + adc ebp, 0 + ; mul a[4]*b[5] + mul edx + add ebx, eax + mov eax, DWORD PTR 12[esi] + adc ecx, edx + mov edx, DWORD PTR 24[edi] + adc ebp, 0 + ; mul a[3]*b[6] + mul edx + add ebx, eax + mov eax, DWORD PTR 8[esi] + adc ecx, edx + mov edx, DWORD PTR 28[edi] + adc ebp, 0 + ; mul a[2]*b[7] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + mov DWORD PTR 36[eax],ebx + mov eax, DWORD PTR 28[esi] + ; saved r[9] + ; ################## Calculate word 10 + xor ebx, ebx + ; mul a[7]*b[3] + mul edx + add ecx, eax + mov eax, DWORD PTR 24[esi] + adc ebp, edx + mov edx, DWORD PTR 16[edi] + adc ebx, 0 + ; mul a[6]*b[4] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esi] + adc ebp, edx + mov edx, DWORD PTR 20[edi] + adc ebx, 0 + ; mul a[5]*b[5] + mul edx + add ecx, eax + mov eax, DWORD PTR 16[esi] + adc ebp, edx + mov edx, DWORD PTR 24[edi] + adc ebx, 0 + ; mul a[4]*b[6] + mul edx + add ecx, eax + mov eax, DWORD PTR 12[esi] + adc ebp, edx + mov edx, DWORD PTR 28[edi] + adc ebx, 0 + ; mul a[3]*b[7] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR 16[edi] + adc ebx, 0 + mov DWORD PTR 40[eax],ecx + mov eax, DWORD PTR 28[esi] + ; saved r[10] + ; ################## Calculate word 11 + xor ecx, ecx + ; mul a[7]*b[4] + mul edx + add ebp, eax + mov eax, DWORD PTR 24[esi] + adc ebx, edx + mov edx, DWORD PTR 20[edi] + adc ecx, 0 + ; mul a[6]*b[5] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esi] + adc ebx, edx + mov edx, DWORD PTR 24[edi] + adc ecx, 0 + ; mul a[5]*b[6] + mul edx + add ebp, eax + mov eax, DWORD PTR 16[esi] + adc ebx, edx + mov edx, DWORD PTR 28[edi] + adc ecx, 0 + ; mul a[4]*b[7] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR 20[edi] + adc ecx, 0 + mov DWORD PTR 44[eax],ebp + mov eax, DWORD PTR 28[esi] + ; saved r[11] + ; 
################## Calculate word 12 + xor ebp, ebp + ; mul a[7]*b[5] + mul edx + add ebx, eax + mov eax, DWORD PTR 24[esi] + adc ecx, edx + mov edx, DWORD PTR 24[edi] + adc ebp, 0 + ; mul a[6]*b[6] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esi] + adc ecx, edx + mov edx, DWORD PTR 28[edi] + adc ebp, 0 + ; mul a[5]*b[7] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR 24[edi] + adc ebp, 0 + mov DWORD PTR 48[eax],ebx + mov eax, DWORD PTR 28[esi] + ; saved r[12] + ; ################## Calculate word 13 + xor ebx, ebx + ; mul a[7]*b[6] + mul edx + add ecx, eax + mov eax, DWORD PTR 24[esi] + adc ebp, edx + mov edx, DWORD PTR 28[edi] + adc ebx, 0 + ; mul a[6]*b[7] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR 28[edi] + adc ebx, 0 + mov DWORD PTR 52[eax],ecx + mov eax, DWORD PTR 28[esi] + ; saved r[13] + ; ################## Calculate word 14 + xor ecx, ecx + ; mul a[7]*b[7] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + adc ecx, 0 + mov DWORD PTR 56[eax],ebp + ; saved r[14] + ; save r[15] + mov DWORD PTR 60[eax],ebx + pop ebx + pop ebp + pop edi + pop esi + ret +_bn_mul_comba8 ENDP +_TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_mul_comba4 + +_bn_mul_comba4 PROC NEAR + push esi + mov esi, DWORD PTR 12[esp] + push edi + mov edi, DWORD PTR 20[esp] + push ebp + push ebx + xor ebx, ebx + mov eax, DWORD PTR [esi] + xor ecx, ecx + mov edx, DWORD PTR [edi] + ; ################## Calculate word 0 + xor ebp, ebp + ; mul a[0]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR [edi] + adc ebp, 0 + mov DWORD PTR [eax],ebx + mov eax, DWORD PTR 4[esi] + ; saved r[0] + ; ################## Calculate word 1 + xor ebx, ebx + ; mul a[1]*b[0] + mul edx + add ecx, eax + mov eax, DWORD PTR [esi] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + ; mul a[0]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov 
edx, DWORD PTR [edi] + adc ebx, 0 + mov DWORD PTR 4[eax],ecx + mov eax, DWORD PTR 8[esi] + ; saved r[1] + ; ################## Calculate word 2 + xor ecx, ecx + ; mul a[2]*b[0] + mul edx + add ebp, eax + mov eax, DWORD PTR 4[esi] + adc ebx, edx + mov edx, DWORD PTR 4[edi] + adc ecx, 0 + ; mul a[1]*b[1] + mul edx + add ebp, eax + mov eax, DWORD PTR [esi] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + ; mul a[0]*b[2] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR [edi] + adc ecx, 0 + mov DWORD PTR 8[eax],ebp + mov eax, DWORD PTR 12[esi] + ; saved r[2] + ; ################## Calculate word 3 + xor ebp, ebp + ; mul a[3]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 8[esi] + adc ecx, edx + mov edx, DWORD PTR 4[edi] + adc ebp, 0 + ; mul a[2]*b[1] + mul edx + add ebx, eax + mov eax, DWORD PTR 4[esi] + adc ecx, edx + mov edx, DWORD PTR 8[edi] + adc ebp, 0 + ; mul a[1]*b[2] + mul edx + add ebx, eax + mov eax, DWORD PTR [esi] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + ; mul a[0]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR 4[edi] + adc ebp, 0 + mov DWORD PTR 12[eax],ebx + mov eax, DWORD PTR 12[esi] + ; saved r[3] + ; ################## Calculate word 4 + xor ebx, ebx + ; mul a[3]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 8[esi] + adc ebp, edx + mov edx, DWORD PTR 8[edi] + adc ebx, 0 + ; mul a[2]*b[2] + mul edx + add ecx, eax + mov eax, DWORD PTR 4[esi] + adc ebp, edx + mov edx, DWORD PTR 12[edi] + adc ebx, 0 + ; mul a[1]*b[3] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR 8[edi] + adc ebx, 0 + mov DWORD PTR 16[eax],ecx + mov eax, DWORD PTR 12[esi] + ; saved r[4] + ; ################## Calculate word 5 + xor ecx, ecx + ; mul a[3]*b[2] + mul edx + add ebp, eax + mov eax, DWORD PTR 8[esi] + adc ebx, edx + mov edx, DWORD PTR 12[edi] + adc ecx, 0 + ; mul a[2]*b[3] + mul edx + add ebp, eax + mov eax, DWORD 
PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR 12[edi] + adc ecx, 0 + mov DWORD PTR 20[eax],ebp + mov eax, DWORD PTR 12[esi] + ; saved r[5] + ; ################## Calculate word 6 + xor ebp, ebp + ; mul a[3]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + adc ebp, 0 + mov DWORD PTR 24[eax],ebx + ; saved r[6] + ; save r[7] + mov DWORD PTR 28[eax],ecx + pop ebx + pop ebp + pop edi + pop esi + ret +_bn_mul_comba4 ENDP +_TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_sqr_comba8 + +_bn_sqr_comba8 PROC NEAR + push esi + push edi + push ebp + push ebx + mov edi, DWORD PTR 20[esp] + mov esi, DWORD PTR 24[esp] + xor ebx, ebx + xor ecx, ecx + mov eax, DWORD PTR [esi] + ; ############### Calculate word 0 + xor ebp, ebp + ; sqr a[0]*a[0] + mul eax + add ebx, eax + adc ecx, edx + mov edx, DWORD PTR [esi] + adc ebp, 0 + mov DWORD PTR [edi],ebx + mov eax, DWORD PTR 4[esi] + ; saved r[0] + ; ############### Calculate word 1 + xor ebx, ebx + ; sqr a[1]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 8[esi] + adc ebx, 0 + mov DWORD PTR 4[edi],ecx + mov edx, DWORD PTR [esi] + ; saved r[1] + ; ############### Calculate word 2 + xor ecx, ecx + ; sqr a[2]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 4[esi] + adc ecx, 0 + ; sqr a[1]*a[1] + mul eax + add ebp, eax + adc ebx, edx + mov edx, DWORD PTR [esi] + adc ecx, 0 + mov DWORD PTR 8[edi],ebp + mov eax, DWORD PTR 12[esi] + ; saved r[2] + ; ############### Calculate word 3 + xor ebp, ebp + ; sqr a[3]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 8[esi] + adc ebp, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[2]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 16[esi] + adc ebp, 0 + mov DWORD PTR 12[edi],ebx + mov edx, DWORD PTR [esi] + ; saved r[3] + ; ############### Calculate word 
4 + xor ebx, ebx + ; sqr a[4]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 12[esi] + adc ebx, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[3]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 8[esi] + adc ebx, 0 + ; sqr a[2]*a[2] + mul eax + add ecx, eax + adc ebp, edx + mov edx, DWORD PTR [esi] + adc ebx, 0 + mov DWORD PTR 16[edi],ecx + mov eax, DWORD PTR 20[esi] + ; saved r[4] + ; ############### Calculate word 5 + xor ecx, ecx + ; sqr a[5]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 16[esi] + adc ecx, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[4]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 12[esi] + adc ecx, 0 + mov edx, DWORD PTR 8[esi] + ; sqr a[3]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 24[esi] + adc ecx, 0 + mov DWORD PTR 20[edi],ebp + mov edx, DWORD PTR [esi] + ; saved r[5] + ; ############### Calculate word 6 + xor ebp, ebp + ; sqr a[6]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 20[esi] + adc ebp, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[5]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 16[esi] + adc ebp, 0 + mov edx, DWORD PTR 8[esi] + ; sqr a[4]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 12[esi] + adc ebp, 0 + ; sqr a[3]*a[3] + mul eax + add ebx, eax + adc ecx, edx + mov edx, DWORD PTR [esi] + adc ebp, 0 + mov DWORD PTR 24[edi],ebx + mov eax, DWORD PTR 28[esi] + ; saved r[6] + ; ############### Calculate word 7 + xor ebx, ebx + ; sqr a[7]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 24[esi] + 
adc ebx, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[6]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 20[esi] + adc ebx, 0 + mov edx, DWORD PTR 8[esi] + ; sqr a[5]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 16[esi] + adc ebx, 0 + mov edx, DWORD PTR 12[esi] + ; sqr a[4]*a[3] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 28[esi] + adc ebx, 0 + mov DWORD PTR 28[edi],ecx + mov edx, DWORD PTR 4[esi] + ; saved r[7] + ; ############### Calculate word 8 + xor ecx, ecx + ; sqr a[7]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 24[esi] + adc ecx, 0 + mov edx, DWORD PTR 8[esi] + ; sqr a[6]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 20[esi] + adc ecx, 0 + mov edx, DWORD PTR 12[esi] + ; sqr a[5]*a[3] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 16[esi] + adc ecx, 0 + ; sqr a[4]*a[4] + mul eax + add ebp, eax + adc ebx, edx + mov edx, DWORD PTR 8[esi] + adc ecx, 0 + mov DWORD PTR 32[edi],ebp + mov eax, DWORD PTR 28[esi] + ; saved r[8] + ; ############### Calculate word 9 + xor ebp, ebp + ; sqr a[7]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 24[esi] + adc ebp, 0 + mov edx, DWORD PTR 12[esi] + ; sqr a[6]*a[3] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 20[esi] + adc ebp, 0 + mov edx, DWORD PTR 16[esi] + ; sqr a[5]*a[4] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 28[esi] + adc ebp, 0 + mov DWORD PTR 36[edi],ebx + mov edx, DWORD PTR 12[esi] + ; saved r[9] + ; ############### Calculate word 10 + xor ebx, ebx + ; sqr a[7]*a[3] + mul edx + 
add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 24[esi] + adc ebx, 0 + mov edx, DWORD PTR 16[esi] + ; sqr a[6]*a[4] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 20[esi] + adc ebx, 0 + ; sqr a[5]*a[5] + mul eax + add ecx, eax + adc ebp, edx + mov edx, DWORD PTR 16[esi] + adc ebx, 0 + mov DWORD PTR 40[edi],ecx + mov eax, DWORD PTR 28[esi] + ; saved r[10] + ; ############### Calculate word 11 + xor ecx, ecx + ; sqr a[7]*a[4] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 24[esi] + adc ecx, 0 + mov edx, DWORD PTR 20[esi] + ; sqr a[6]*a[5] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 28[esi] + adc ecx, 0 + mov DWORD PTR 44[edi],ebp + mov edx, DWORD PTR 20[esi] + ; saved r[11] + ; ############### Calculate word 12 + xor ebp, ebp + ; sqr a[7]*a[5] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 24[esi] + adc ebp, 0 + ; sqr a[6]*a[6] + mul eax + add ebx, eax + adc ecx, edx + mov edx, DWORD PTR 24[esi] + adc ebp, 0 + mov DWORD PTR 48[edi],ebx + mov eax, DWORD PTR 28[esi] + ; saved r[12] + ; ############### Calculate word 13 + xor ebx, ebx + ; sqr a[7]*a[6] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 28[esi] + adc ebx, 0 + mov DWORD PTR 52[edi],ecx + ; saved r[13] + ; ############### Calculate word 14 + xor ecx, ecx + ; sqr a[7]*a[7] + mul eax + add ebp, eax + adc ebx, edx + adc ecx, 0 + mov DWORD PTR 56[edi],ebp + ; saved r[14] + mov DWORD PTR 60[edi],ebx + pop ebx + pop ebp + pop edi + pop esi + ret +_bn_sqr_comba8 ENDP +_TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_sqr_comba4 + +_bn_sqr_comba4 PROC NEAR + push esi + push edi + push ebp + push ebx + mov edi, DWORD PTR 20[esp] + mov esi, DWORD PTR 24[esp] + xor ebx, ebx + xor ecx, ecx + mov eax, DWORD 
PTR [esi] + ; ############### Calculate word 0 + xor ebp, ebp + ; sqr a[0]*a[0] + mul eax + add ebx, eax + adc ecx, edx + mov edx, DWORD PTR [esi] + adc ebp, 0 + mov DWORD PTR [edi],ebx + mov eax, DWORD PTR 4[esi] + ; saved r[0] + ; ############### Calculate word 1 + xor ebx, ebx + ; sqr a[1]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 8[esi] + adc ebx, 0 + mov DWORD PTR 4[edi],ecx + mov edx, DWORD PTR [esi] + ; saved r[1] + ; ############### Calculate word 2 + xor ecx, ecx + ; sqr a[2]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 4[esi] + adc ecx, 0 + ; sqr a[1]*a[1] + mul eax + add ebp, eax + adc ebx, edx + mov edx, DWORD PTR [esi] + adc ecx, 0 + mov DWORD PTR 8[edi],ebp + mov eax, DWORD PTR 12[esi] + ; saved r[2] + ; ############### Calculate word 3 + xor ebp, ebp + ; sqr a[3]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 8[esi] + adc ebp, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[2]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 12[esi] + adc ebp, 0 + mov DWORD PTR 12[edi],ebx + mov edx, DWORD PTR 4[esi] + ; saved r[3] + ; ############### Calculate word 4 + xor ebx, ebx + ; sqr a[3]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 8[esi] + adc ebx, 0 + ; sqr a[2]*a[2] + mul eax + add ecx, eax + adc ebp, edx + mov edx, DWORD PTR 8[esi] + adc ebx, 0 + mov DWORD PTR 16[edi],ecx + mov eax, DWORD PTR 12[esi] + ; saved r[4] + ; ############### Calculate word 5 + xor ecx, ecx + ; sqr a[3]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 12[esi] + adc ecx, 0 + mov DWORD PTR 20[edi],ebp + ; saved r[5] + ; ############### Calculate word 6 + xor ebp, ebp + ; sqr a[3]*a[3] + mul eax + add ebx, eax + adc ecx, 
edx + adc ebp, 0 + mov DWORD PTR 24[edi],ebx + ; saved r[6] + mov DWORD PTR 28[edi],ecx + pop ebx + pop ebp + pop edi + pop esi + ret +_bn_sqr_comba4 ENDP +_TEXT ENDS END diff --git a/crypto/bn/asm/bn86unix.cpp b/crypto/bn/asm/bn86unix.cpp index 64702201ea..639a3ac41c 100644 --- a/crypto/bn/asm/bn86unix.cpp +++ b/crypto/bn/asm/bn86unix.cpp @@ -12,8 +12,13 @@ #define bn_mul_add_words _bn_mul_add_words #define bn_mul_words _bn_mul_words #define bn_sqr_words _bn_sqr_words -#define bn_div64 _bn_div64 +#define bn_div_words _bn_div_words #define bn_add_words _bn_add_words +#define bn_sub_words _bn_sub_words +#define bn_mul_comba8 _bn_mul_comba8 +#define bn_mul_comba4 _bn_mul_comba4 +#define bn_sqr_comba8 _bn_sqr_comba8 +#define bn_sqr_comba4 _bn_sqr_comba4 #endif @@ -544,9 +549,9 @@ bn_sqr_words: .ident "bn_sqr_words" .text .align ALIGN -.globl bn_div64 - TYPE(bn_div64,@function) -bn_div64: +.globl bn_div_words + TYPE(bn_div_words,@function) +bn_div_words: pushl %ebp pushl %ebx pushl %esi @@ -561,9 +566,9 @@ bn_div64: popl %ebx popl %ebp ret -.bn_div64_end: - SIZE(bn_div64,.bn_div64_end-bn_div64) -.ident "bn_div64" +.bn_div_words_end: + SIZE(bn_div_words,.bn_div_words_end-bn_div_words) +.ident "bn_div_words" .text .align ALIGN .globl bn_add_words @@ -741,7 +746,6 @@ bn_add_words: adcl $0, %eax movl %ecx, 24(%ebx) .L013aw_end: - movl %eax, %eax popl %edi popl %esi popl %ebx @@ -750,3 +754,1448 @@ bn_add_words: .bn_add_words_end: SIZE(bn_add_words,.bn_add_words_end-bn_add_words) .ident "bn_add_words" +.text + .align ALIGN +.globl bn_sub_words + TYPE(bn_sub_words,@function) +bn_sub_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + movl 20(%esp), %ebx + movl 24(%esp), %esi + movl 28(%esp), %edi + movl 32(%esp), %ebp + xorl %eax, %eax + andl $4294967288, %ebp + jz .L014aw_finish +.L015aw_loop: + /* Round 0 */ + movl (%esi), %ecx + movl (%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, (%ebx) + /* 
Round 1 */ + movl 4(%esi), %ecx + movl 4(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 4(%ebx) + /* Round 2 */ + movl 8(%esi), %ecx + movl 8(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 8(%ebx) + /* Round 3 */ + movl 12(%esi), %ecx + movl 12(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 12(%ebx) + /* Round 4 */ + movl 16(%esi), %ecx + movl 16(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 16(%ebx) + /* Round 5 */ + movl 20(%esi), %ecx + movl 20(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 20(%ebx) + /* Round 6 */ + movl 24(%esi), %ecx + movl 24(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) + /* Round 7 */ + movl 28(%esi), %ecx + movl 28(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 28(%ebx) + + addl $32, %esi + addl $32, %edi + addl $32, %ebx + subl $8, %ebp + jnz .L015aw_loop +.L014aw_finish: + movl 32(%esp), %ebp + andl $7, %ebp + jz .L016aw_end + /* Tail Round 0 */ + movl (%esi), %ecx + movl (%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, (%ebx) + jz .L016aw_end + /* Tail Round 1 */ + movl 4(%esi), %ecx + movl 4(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 4(%ebx) + jz .L016aw_end + /* Tail Round 2 */ + movl 8(%esi), %ecx + movl 8(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 8(%ebx) + jz .L016aw_end + /* Tail Round 3 */ + movl 12(%esi), %ecx + movl 12(%edi), %edx + subl %eax, 
%ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 12(%ebx) + jz .L016aw_end + /* Tail Round 4 */ + movl 16(%esi), %ecx + movl 16(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 16(%ebx) + jz .L016aw_end + /* Tail Round 5 */ + movl 20(%esi), %ecx + movl 20(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 20(%ebx) + jz .L016aw_end + /* Tail Round 6 */ + movl 24(%esi), %ecx + movl 24(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) +.L016aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_sub_words_end: + SIZE(bn_sub_words,.bn_sub_words_end-bn_sub_words) +.ident "bn_sub_words" +.text + .align ALIGN +.globl bn_mul_comba8 + TYPE(bn_mul_comba8,@function) +bn_mul_comba8: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + /* ################## Calculate word 0 */ + xorl %ebp, %ebp + /* mul a[0]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + /* saved r[0] */ + /* ################## Calculate word 1 */ + xorl %ebx, %ebx + /* mul a[1]*b[0] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + /* saved r[1] */ + /* ################## Calculate word 2 */ + xorl %ecx, %ecx + /* mul a[2]*b[0] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[1] */ + mull %edx + addl %eax, %ebp + 
movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + /* saved r[2] */ + /* ################## Calculate word 3 */ + xorl %ebp, %ebp + /* mul a[3]*b[0] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[1] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[2] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 16(%esi), %eax + /* saved r[3] */ + /* ################## Calculate word 4 */ + xorl %ebx, %ebx + /* mul a[4]*b[0] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[1] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[2] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[3] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[4] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 20(%esi), %eax + /* saved r[4] */ + /* ################## Calculate word 5 */ + xorl %ecx, %ecx + /* mul a[5]*b[0] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[1] */ + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul 
a[3]*b[2] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[3] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[4] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[5] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 24(%esi), %eax + /* saved r[5] */ + /* ################## Calculate word 6 */ + xorl %ebp, %ebp + /* mul a[6]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[1] */ + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[4]*b[2] */ + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[3]*b[3] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[4] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[5] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[6] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 24(%eax) + movl 28(%esi), %eax + /* saved r[6] */ + /* ################## Calculate word 7 */ + xorl %ebx, %ebx + /* mul a[7]*b[0] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[5]*b[2] */ + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl 
%edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[4]*b[3] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[4] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[5] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[6] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + movl %ecx, 28(%eax) + movl 28(%esi), %eax + /* saved r[7] */ + /* ################## Calculate word 8 */ + xorl %ecx, %ecx + /* mul a[7]*b[1] */ + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[6]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[5]*b[3] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[4] */ + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[3]*b[5] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[6] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + movl %ebp, 32(%eax) + movl 28(%esi), %eax + /* saved r[8] */ + /* ################## Calculate word 9 */ + xorl %ebp, %ebp + /* mul a[7]*b[2] */ + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[6]*b[3] 
*/ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[4] */ + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + /* mul a[4]*b[5] */ + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[3]*b[6] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[7] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + movl %ebx, 36(%eax) + movl 28(%esi), %eax + /* saved r[9] */ + /* ################## Calculate word 10 */ + xorl %ebx, %ebx + /* mul a[7]*b[3] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[4] */ + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + /* mul a[5]*b[5] */ + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + /* mul a[4]*b[6] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + movl %ecx, 40(%eax) + movl 28(%esi), %eax + /* saved r[10] */ + /* ################## Calculate word 11 */ + xorl %ecx, %ecx + /* mul a[7]*b[4] */ + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[6]*b[5] */ + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + /* mul a[5]*b[6] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + 
adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + movl %ebp, 44(%eax) + movl 28(%esi), %eax + /* saved r[11] */ + /* ################## Calculate word 12 */ + xorl %ebp, %ebp + /* mul a[7]*b[5] */ + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[6]*b[6] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[7] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + movl %ebx, 48(%eax) + movl 28(%esi), %eax + /* saved r[12] */ + /* ################## Calculate word 13 */ + xorl %ebx, %ebx + /* mul a[7]*b[6] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + movl %ecx, 52(%eax) + movl 28(%esi), %eax + /* saved r[13] */ + /* ################## Calculate word 14 */ + xorl %ecx, %ecx + /* mul a[7]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%eax) + /* saved r[14] */ + /* save r[15] */ + movl %ebx, 60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba8_end: + SIZE(bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_mul_comba4 + TYPE(bn_mul_comba4,@function) +bn_mul_comba4: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + /* ################## Calculate word 0 */ + xorl %ebp, %ebp + /* mul a[0]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + /* saved r[0] */ + /* ################## Calculate word 1 */ + xorl %ebx, 
%ebx + /* mul a[1]*b[0] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + /* saved r[1] */ + /* ################## Calculate word 2 */ + xorl %ecx, %ecx + /* mul a[2]*b[0] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[1] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + /* saved r[2] */ + /* ################## Calculate word 3 */ + xorl %ebp, %ebp + /* mul a[3]*b[0] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[1] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[2] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 12(%esi), %eax + /* saved r[3] */ + /* ################## Calculate word 4 */ + xorl %ebx, %ebx + /* mul a[3]*b[1] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[2] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[3] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 12(%esi), %eax + /* saved r[4] */ + /* 
################## Calculate word 5 */ + xorl %ecx, %ecx + /* mul a[3]*b[2] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[3] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 12(%esi), %eax + /* saved r[5] */ + /* ################## Calculate word 6 */ + xorl %ebp, %ebp + /* mul a[3]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%eax) + /* saved r[6] */ + /* save r[7] */ + movl %ecx, 28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba4_end: + SIZE(bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_sqr_comba8 + TYPE(bn_sqr_comba8,@function) +bn_sqr_comba8: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + /* ############### Calculate word 0 */ + xorl %ebp, %ebp + /* sqr a[0]*a[0] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + /* saved r[0] */ + /* ############### Calculate word 1 */ + xorl %ebx, %ebx + /* sqr a[1]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + /* saved r[1] */ + /* ############### Calculate word 2 */ + xorl %ecx, %ecx + /* sqr a[2]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + /* sqr a[1]*a[1] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + /* saved r[2] */ + /* ############### Calculate word 3 */ + xorl %ebp, %ebp + /* sqr a[3]*a[0] */ + 
mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[2]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl (%esi), %edx + /* saved r[3] */ + /* ############### Calculate word 4 */ + xorl %ebx, %ebx + /* sqr a[4]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 12(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + /* sqr a[3]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + /* sqr a[2]*a[2] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl (%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 20(%esi), %eax + /* saved r[4] */ + /* ############### Calculate word 5 */ + xorl %ecx, %ecx + /* sqr a[5]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + movl 4(%esi), %edx + /* sqr a[4]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + /* sqr a[3]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + movl (%esi), %edx + /* saved r[5] */ + /* ############### Calculate word 6 */ + xorl %ebp, %ebp + /* sqr a[6]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[5]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl 
8(%esi), %edx + /* sqr a[4]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + /* sqr a[3]*a[3] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, 24(%edi) + movl 28(%esi), %eax + /* saved r[6] */ + /* ############### Calculate word 7 */ + xorl %ebx, %ebx + /* sqr a[7]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + /* sqr a[6]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + movl 8(%esi), %edx + /* sqr a[5]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %eax + adcl $0, %ebx + movl 12(%esi), %edx + /* sqr a[4]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 28(%edi) + movl 4(%esi), %edx + /* saved r[7] */ + /* ############### Calculate word 8 */ + xorl %ecx, %ecx + /* sqr a[7]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + /* sqr a[6]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 20(%esi), %eax + adcl $0, %ecx + movl 12(%esi), %edx + /* sqr a[5]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + /* sqr a[4]*a[4] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl 8(%esi), %edx + adcl $0, %ecx + movl %ebp, 32(%edi) + movl 28(%esi), %eax + /* saved r[8] */ + /* ############### Calculate word 9 */ + xorl %ebp, %ebp + /* sqr a[7]*a[2] 
*/ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + movl 12(%esi), %edx + /* sqr a[6]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 16(%esi), %edx + /* sqr a[5]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 28(%esi), %eax + adcl $0, %ebp + movl %ebx, 36(%edi) + movl 12(%esi), %edx + /* saved r[9] */ + /* ############### Calculate word 10 */ + xorl %ebx, %ebx + /* sqr a[7]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 16(%esi), %edx + /* sqr a[6]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + /* sqr a[5]*a[5] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %edx + adcl $0, %ebx + movl %ecx, 40(%edi) + movl 28(%esi), %eax + /* saved r[10] */ + /* ############### Calculate word 11 */ + xorl %ecx, %ecx + /* sqr a[7]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 20(%esi), %edx + /* sqr a[6]*a[5] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 28(%esi), %eax + adcl $0, %ecx + movl %ebp, 44(%edi) + movl 20(%esi), %edx + /* saved r[11] */ + /* ############### Calculate word 12 */ + xorl %ebp, %ebp + /* sqr a[7]*a[5] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + /* sqr a[6]*a[6] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %edx + adcl $0, %ebp + movl %ebx, 48(%edi) + movl 28(%esi), %eax + /* saved r[12] */ 
+ /* ############### Calculate word 13 */ + xorl %ebx, %ebx + /* sqr a[7]*a[6] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 52(%edi) + /* saved r[13] */ + /* ############### Calculate word 14 */ + xorl %ecx, %ecx + /* sqr a[7]*a[7] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%edi) + /* saved r[14] */ + movl %ebx, 60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba8_end: + SIZE(bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_sqr_comba4 + TYPE(bn_sqr_comba4,@function) +bn_sqr_comba4: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + /* ############### Calculate word 0 */ + xorl %ebp, %ebp + /* sqr a[0]*a[0] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + /* saved r[0] */ + /* ############### Calculate word 1 */ + xorl %ebx, %ebx + /* sqr a[1]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + /* saved r[1] */ + /* ############### Calculate word 2 */ + xorl %ecx, %ecx + /* sqr a[2]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + /* sqr a[1]*a[1] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + /* saved r[2] */ + /* ############### Calculate word 3 */ + xorl %ebp, %ebp + /* sqr a[3]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* 
sqr a[2]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl 4(%esi), %edx + /* saved r[3] */ + /* ############### Calculate word 4 */ + xorl %ebx, %ebx + /* sqr a[3]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + /* sqr a[2]*a[2] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 12(%esi), %eax + /* saved r[4] */ + /* ############### Calculate word 5 */ + xorl %ecx, %ecx + /* sqr a[3]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + /* saved r[5] */ + /* ############### Calculate word 6 */ + xorl %ebp, %ebp + /* sqr a[3]*a[3] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%edi) + /* saved r[6] */ + movl %ecx, 28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba4_end: + SIZE(bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4) +.ident "desasm.pl" diff --git a/crypto/bn/asm/ca.pl b/crypto/bn/asm/ca.pl new file mode 100644 index 0000000000..181d1f007e --- /dev/null +++ b/crypto/bn/asm/ca.pl @@ -0,0 +1,33 @@ +#!/usr/local/bin/perl +# I have this in perl so I can use more usefull register names and then convert +# them into alpha registers. 
+# + +push(@INC,"perlasm","../../perlasm"); +require "alpha.pl"; +require "alpha/mul_add.pl"; +require "alpha/mul.pl"; +require "alpha/sqr.pl"; +require "alpha/add.pl"; +require "alpha/sub.pl"; +require "alpha/mul_c8.pl"; +require "alpha/mul_c4.pl"; +require "alpha/sqr_c4.pl"; +require "alpha/sqr_c8.pl"; +require "alpha/div.pl"; + +&asm_init($ARGV[0],"bn-586.pl"); + +&bn_mul_words("bn_mul_words"); +&bn_sqr_words("bn_sqr_words"); +&bn_mul_add_words("bn_mul_add_words"); +&bn_add_words("bn_add_words"); +&bn_sub_words("bn_sub_words"); +&bn_div_words("bn_div_words"); +&bn_mul_comba8("bn_mul_comba8"); +&bn_mul_comba4("bn_mul_comba4"); +&bn_sqr_comba4("bn_sqr_comba4"); +&bn_sqr_comba8("bn_sqr_comba8"); + +&asm_finish(); + diff --git a/crypto/bn/asm/co-586.pl b/crypto/bn/asm/co-586.pl new file mode 100644 index 0000000000..0bcb5a6d47 --- /dev/null +++ b/crypto/bn/asm/co-586.pl @@ -0,0 +1,286 @@ +#!/usr/local/bin/perl + +push(@INC,"perlasm","../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"bn-586.pl"); + +&bn_mul_comba("bn_mul_comba8",8); +&bn_mul_comba("bn_mul_comba4",4); +&bn_sqr_comba("bn_sqr_comba8",8); +&bn_sqr_comba("bn_sqr_comba4",4); + +&asm_finish(); + +sub mul_add_c + { + local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; + + # pos == -1 if eax and edx are pre-loaded, 0 to load from next + # words, and 1 if load return value + + &comment("mul a[$ai]*b[$bi]"); + + # "eax" and "edx" will always be pre-loaded. 
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ; + # &mov("edx",&DWP($bi*4,$b,"",0)); + + &mul("edx"); + &add($c0,"eax"); + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a + &mov("eax",&wparam(0)) if $pos > 0; # load r[] + ### + &adc($c1,"edx"); + &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b + &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b + ### + &adc($c2,0); + # is pos > 1, it means it is the last loop + &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[]; + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a + } + +sub sqr_add_c + { + local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; + + # pos == -1 if eax and edx are pre-loaded, 0 to load from next + # words, and 1 if load return value + + &comment("sqr a[$ai]*a[$bi]"); + + # "eax" and "edx" will always be pre-loaded. + # &mov("eax",&DWP($ai*4,$a,"",0)) ; + # &mov("edx",&DWP($bi*4,$b,"",0)); + + if ($ai == $bi) + { &mul("eax");} + else + { &mul("edx");} + &add($c0,"eax"); + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a + ### + &adc($c1,"edx"); + &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb); + ### + &adc($c2,0); + # is pos > 1, it means it is the last loop + &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b + } + +sub sqr_add_c2 + { + local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; + + # pos == -1 if eax and edx are pre-loaded, 0 to load from next + # words, and 1 if load return value + + &comment("sqr a[$ai]*a[$bi]"); + + # "eax" and "edx" will always be pre-loaded. 
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ; + # &mov("edx",&DWP($bi*4,$a,"",0)); + + if ($ai == $bi) + { &mul("eax");} + else + { &mul("edx");} + &add("eax","eax"); + ### + &adc("edx","edx"); + ### + &adc($c2,0); + &add($c0,"eax"); + &adc($c1,"edx"); + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b + &adc($c2,0); + &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; + &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb); + ### + } + +sub bn_mul_comba + { + local($name,$num)=@_; + local($a,$b,$c0,$c1,$c2); + local($i,$as,$ae,$bs,$be,$ai,$bi); + local($tot,$end); + + &function_begin_B($name,""); + + $c0="ebx"; + $c1="ecx"; + $c2="ebp"; + $a="esi"; + $b="edi"; + + $as=0; + $ae=0; + $bs=0; + $be=0; + $tot=$num+$num-1; + + &push("esi"); + &mov($a,&wparam(1)); + &push("edi"); + &mov($b,&wparam(2)); + &push("ebp"); + &push("ebx"); + + &xor($c0,$c0); + &mov("eax",&DWP(0,$a,"",0)); # load the first word + &xor($c1,$c1); + &mov("edx",&DWP(0,$b,"",0)); # load the first second + + for ($i=0; $i<$tot; $i++) + { + $ai=$as; + $bi=$bs; + $end=$be+1; + + &comment("################## Calculate word $i"); + + for ($j=$bs; $j<$end; $j++) + { + &xor($c2,$c2) if ($j == $bs); + if (($j+1) == $end) + { + $v=1; + $v=2 if (($i+1) == $tot); + } + else + { $v=0; } + if (($j+1) != $end) + { + $na=($ai-1); + $nb=($bi+1); + } + else + { + $na=$as+($i < ($num-1)); + $nb=$bs+($i >= ($num-1)); + } +#printf STDERR "[$ai,$bi] -> [$na,$nb]\n"; + &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb); + if ($v) + { + &comment("saved r[$i]"); + # &mov("eax",&wparam(0)); + # &mov(&DWP($i*4,"eax","",0),$c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + } + $ai--; + $bi++; + } + $as++ if ($i < ($num-1)); + $ae++ if ($i >= ($num-1)); + + $bs++ if ($i >= ($num-1)); + $be++ if ($i < ($num-1)); + } + &comment("save r[$i]"); + # &mov("eax",&wparam(0)); + &mov(&DWP($i*4,"eax","",0),$c0); + + &pop("ebx"); + &pop("ebp"); + &pop("edi"); + 
&pop("esi"); + &ret(); + &function_end_B($name); + } + +sub bn_sqr_comba + { + local($name,$num)=@_; + local($r,$a,$c0,$c1,$c2)=@_; + local($i,$as,$ae,$bs,$be,$ai,$bi); + local($b,$tot,$end,$half); + + &function_begin_B($name,""); + + $c0="ebx"; + $c1="ecx"; + $c2="ebp"; + $a="esi"; + $r="edi"; + + &push("esi"); + &push("edi"); + &push("ebp"); + &push("ebx"); + &mov($r,&wparam(0)); + &mov($a,&wparam(1)); + &xor($c0,$c0); + &xor($c1,$c1); + &mov("eax",&DWP(0,$a,"",0)); # load the first word + + $as=0; + $ae=0; + $bs=0; + $be=0; + $tot=$num+$num-1; + + for ($i=0; $i<$tot; $i++) + { + $ai=$as; + $bi=$bs; + $end=$be+1; + + &comment("############### Calculate word $i"); + for ($j=$bs; $j<$end; $j++) + { + &xor($c2,$c2) if ($j == $bs); + if (($ai-1) < ($bi+1)) + { + $v=1; + $v=2 if ($i+1) == $tot; + } + else + { $v=0; } + if (!$v) + { + $na=$ai-1; + $nb=$bi+1; + } + else + { + $na=$as+($i < ($num-1)); + $nb=$bs+($i >= ($num-1)); + } + if ($ai == $bi) + { + &sqr_add_c($r,$a,$ai,$bi, + $c0,$c1,$c2,$v,$i,$na,$nb); + } + else + { + &sqr_add_c2($r,$a,$ai,$bi, + $c0,$c1,$c2,$v,$i,$na,$nb); + } + if ($v) + { + &comment("saved r[$i]"); + #&mov(&DWP($i*4,$r,"",0),$c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + last; + } + $ai--; + $bi++; + } + $as++ if ($i < ($num-1)); + $ae++ if ($i >= ($num-1)); + + $bs++ if ($i >= ($num-1)); + $be++ if ($i < ($num-1)); + } + &mov(&DWP($i*4,$r,"",0),$c0); + &pop("ebx"); + &pop("ebp"); + &pop("edi"); + &pop("esi"); + &ret(); + &function_end_B($name); + } diff --git a/crypto/bn/asm/co-alpha.pl b/crypto/bn/asm/co-alpha.pl new file mode 100644 index 0000000000..23869a4ef5 --- /dev/null +++ b/crypto/bn/asm/co-alpha.pl @@ -0,0 +1,116 @@ +#!/usr/local/bin/perl +# I have this in perl so I can use more usefull register names and then convert +# them into alpha registers. 
+# + +push(@INC,"perlasm","../../perlasm"); +require "alpha.pl"; + +&asm_init($ARGV[0],"bn-586.pl"); + +print &bn_sub_words("bn_sub_words"); + +&asm_finish(); + +sub bn_sub_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + $cc="r0"; + $a0="r1"; $b0="r5"; $r0="r9"; $tmp="r13"; + $a1="r2"; $b1="r6"; $r1="r10"; $t1="r14"; + $a2="r3"; $b2="r7"; $r2="r11"; + $a3="r4"; $b3="r8"; $r3="r12"; $t3="r15"; + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &blt($count,&label("finish")); + + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + +########################################################## + &set_label("loop"); + + &ld($a1,&QWPw(1,$ap)); + &cmpult($a0,$b0,$tmp); # will we borrow? + &ld($b1,&QWPw(1,$bp)); + &sub($a0,$b0,$a0); # do the subtract + &ld($a2,&QWPw(2,$ap)); + &cmpult($a0,$cc,$b0); # will we borrow? + &ld($b2,&QWPw(2,$bp)); + &sub($a0,$cc,$a0); # will we borrow? + &ld($a3,&QWPw(3,$ap)); + &add($b0,$tmp,$cc); # add the borrows + + &cmpult($a1,$b1,$t1); # will we borrow? + &sub($a1,$b1,$a1); # do the subtract + &ld($b3,&QWPw(3,$bp)); + &cmpult($a1,$cc,$b1); # will we borrow? + &sub($a1,$cc,$a1); # will we borrow? + &add($b1,$t1,$cc); # add the borrows + + &cmpult($a2,$b2,$tmp); # will we borrow? + &sub($a2,$b2,$a2); # do the subtract + &st($a0,&QWPw(0,$rp)); # save + &cmpult($a2,$cc,$b2); # will we borrow? + &sub($a2,$cc,$a2); # will we borrow? + &add($b2,$tmp,$cc); # add the borrows + + &cmpult($a3,$b3,$t3); # will we borrow? + &sub($a3,$b3,$a3); # do the subtract + &st($a1,&QWPw(1,$rp)); # save + &cmpult($a3,$cc,$b3); # will we borrow? + &sub($a3,$cc,$a3); # will we borrow? 
+ &add($b3,$t3,$cc); # add the borrows + + &st($a2,&QWPw(2,$rp)); # save + &sub($count,4,$count); # count-=4 + &st($a3,&QWPw(3,$rp)); # save + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + &cmpult($a0,$b0,$tmp); # will we borrow? + &sub($a0,$b0,$a0); # do the subtract + &cmpult($a0,$cc,$b0); # will we borrow? + &sub($a0,$cc,$a0); # will we borrow? + &st($a0,&QWPw(0,$rp)); # save + &add($b0,$tmp,$cc); # add the borrows + + &add($ap,$QWS,$ap); + &add($bp,$QWS,$bp); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + } + diff --git a/crypto/bn/asm/co86unix.cpp b/crypto/bn/asm/co86unix.cpp new file mode 100644 index 0000000000..fa80b14046 --- /dev/null +++ b/crypto/bn/asm/co86unix.cpp @@ -0,0 +1,1315 @@ +/* Run the C pre-processor over this file with one of the following defined + * ELF - elf object files, + * OUT - a.out object files, + * BSDI - BSDI style a.out object files + * SOL - Solaris style elf + */ + +#define TYPE(a,b) .type a,b +#define SIZE(a,b) .size a,b + +#if defined(OUT) || defined(BSDI) +#define bn_mul_comba8 _bn_mul_comba8 +#define bn_mul_comba4 _bn_mul_comba4 +#define bn_sqr_comba8 _bn_sqr_comba8 +#define bn_sqr_comba4 _bn_sqr_comba4 + +#endif + +#ifdef OUT +#define OK 1 +#define ALIGN 4 +#endif + +#ifdef BSDI +#define OK 1 +#define ALIGN 4 +#undef SIZE +#undef TYPE +#define SIZE(a,b) +#define TYPE(a,b) +#endif + +#if defined(ELF) || defined(SOL) +#define OK 1 +#define 
ALIGN 16 +#endif + +#ifndef OK +You need to define one of +ELF - elf systems - linux-elf, NetBSD and DG-UX +OUT - a.out systems - linux-a.out and FreeBSD +SOL - solaris systems, which are elf with strange comment lines +BSDI - a.out with a very primative version of as. +#endif + +/* Let the Assembler begin :-) */ + /* Don't even think of reading this code */ + /* It was automatically generated by bn-586.pl */ + /* Which is a perl program used to generate the x86 assember for */ + /* any of elf, a.out, BSDI,Win32, or Solaris */ + /* eric <eay@cryptsoft.com> */ + + .file "bn-586.s" + .version "01.01" +gcc2_compiled.: +.text + .align ALIGN +.globl bn_mul_comba8 + TYPE(bn_mul_comba8,@function) +bn_mul_comba8: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + /* ################## Calculate word 0 */ + xorl %ebp, %ebp + /* mul a[0]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + /* saved r[0] */ + /* ################## Calculate word 1 */ + xorl %ebx, %ebx + /* mul a[1]*b[0] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + /* saved r[1] */ + /* ################## Calculate word 2 */ + xorl %ecx, %ecx + /* mul a[2]*b[0] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[1] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), 
%eax + /* saved r[2] */ + /* ################## Calculate word 3 */ + xorl %ebp, %ebp + /* mul a[3]*b[0] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[1] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[2] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 16(%esi), %eax + /* saved r[3] */ + /* ################## Calculate word 4 */ + xorl %ebx, %ebx + /* mul a[4]*b[0] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[1] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[2] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[3] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[4] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 20(%esi), %eax + /* saved r[4] */ + /* ################## Calculate word 5 */ + xorl %ecx, %ecx + /* mul a[5]*b[0] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[1] */ + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[3]*b[2] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[3] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl 
$0, %ecx + /* mul a[1]*b[4] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[5] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 24(%esi), %eax + /* saved r[5] */ + /* ################## Calculate word 6 */ + xorl %ebp, %ebp + /* mul a[6]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[1] */ + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[4]*b[2] */ + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[3]*b[3] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[4] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[5] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[6] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 24(%eax) + movl 28(%esi), %eax + /* saved r[6] */ + /* ################## Calculate word 7 */ + xorl %ebx, %ebx + /* mul a[7]*b[0] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[5]*b[2] */ + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[4]*b[3] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[4] */ + mull %edx + addl %eax, %ecx + movl 
8(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[5] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[6] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + movl %ecx, 28(%eax) + movl 28(%esi), %eax + /* saved r[7] */ + /* ################## Calculate word 8 */ + xorl %ecx, %ecx + /* mul a[7]*b[1] */ + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[6]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[5]*b[3] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[4] */ + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[3]*b[5] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[6] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + movl %ebp, 32(%eax) + movl 28(%esi), %eax + /* saved r[8] */ + /* ################## Calculate word 9 */ + xorl %ebp, %ebp + /* mul a[7]*b[2] */ + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[6]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[4] */ + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, 
%ebp + /* mul a[4]*b[5] */ + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[3]*b[6] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[7] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + movl %ebx, 36(%eax) + movl 28(%esi), %eax + /* saved r[9] */ + /* ################## Calculate word 10 */ + xorl %ebx, %ebx + /* mul a[7]*b[3] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[4] */ + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + /* mul a[5]*b[5] */ + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + /* mul a[4]*b[6] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + movl %ecx, 40(%eax) + movl 28(%esi), %eax + /* saved r[10] */ + /* ################## Calculate word 11 */ + xorl %ecx, %ecx + /* mul a[7]*b[4] */ + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[6]*b[5] */ + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + /* mul a[5]*b[6] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + movl %ebp, 44(%eax) + movl 28(%esi), %eax + /* saved r[11] */ + /* ################## Calculate word 12 */ + xorl %ebp, %ebp + /* mul a[7]*b[5] */ + mull %edx + addl %eax, %ebx + 
movl 24(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[6]*b[6] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[7] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + movl %ebx, 48(%eax) + movl 28(%esi), %eax + /* saved r[12] */ + /* ################## Calculate word 13 */ + xorl %ebx, %ebx + /* mul a[7]*b[6] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + movl %ecx, 52(%eax) + movl 28(%esi), %eax + /* saved r[13] */ + /* ################## Calculate word 14 */ + xorl %ecx, %ecx + /* mul a[7]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%eax) + /* saved r[14] */ + /* save r[15] */ + movl %ebx, 60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba8_end: + SIZE(bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_mul_comba4 + TYPE(bn_mul_comba4,@function) +bn_mul_comba4: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + /* ################## Calculate word 0 */ + xorl %ebp, %ebp + /* mul a[0]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + /* saved r[0] */ + /* ################## Calculate word 1 */ + xorl %ebx, %ebx + /* mul a[1]*b[0] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 
(%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + /* saved r[1] */ + /* ################## Calculate word 2 */ + xorl %ecx, %ecx + /* mul a[2]*b[0] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[1] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + /* saved r[2] */ + /* ################## Calculate word 3 */ + xorl %ebp, %ebp + /* mul a[3]*b[0] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[1] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[2] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 12(%esi), %eax + /* saved r[3] */ + /* ################## Calculate word 4 */ + xorl %ebx, %ebx + /* mul a[3]*b[1] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[2] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[3] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 12(%esi), %eax + /* saved r[4] */ + /* ################## Calculate word 5 */ + xorl %ecx, %ecx + /* mul a[3]*b[2] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[3] */ + mull %edx + addl %eax, %ebp + movl 
20(%esp), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 12(%esi), %eax + /* saved r[5] */ + /* ################## Calculate word 6 */ + xorl %ebp, %ebp + /* mul a[3]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%eax) + /* saved r[6] */ + /* save r[7] */ + movl %ecx, 28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba4_end: + SIZE(bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_sqr_comba8 + TYPE(bn_sqr_comba8,@function) +bn_sqr_comba8: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + /* ############### Calculate word 0 */ + xorl %ebp, %ebp + /* sqr a[0]*a[0] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + /* saved r[0] */ + /* ############### Calculate word 1 */ + xorl %ebx, %ebx + /* sqr a[1]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + /* saved r[1] */ + /* ############### Calculate word 2 */ + xorl %ecx, %ecx + /* sqr a[2]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + /* sqr a[1]*a[1] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + /* saved r[2] */ + /* ############### Calculate word 3 */ + xorl %ebp, %ebp + /* sqr a[3]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[2]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl 
%eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl (%esi), %edx + /* saved r[3] */ + /* ############### Calculate word 4 */ + xorl %ebx, %ebx + /* sqr a[4]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 12(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + /* sqr a[3]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + /* sqr a[2]*a[2] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl (%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 20(%esi), %eax + /* saved r[4] */ + /* ############### Calculate word 5 */ + xorl %ecx, %ecx + /* sqr a[5]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + movl 4(%esi), %edx + /* sqr a[4]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + /* sqr a[3]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + movl (%esi), %edx + /* saved r[5] */ + /* ############### Calculate word 6 */ + xorl %ebp, %ebp + /* sqr a[6]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[5]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl 8(%esi), %edx + /* sqr a[4]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + /* sqr a[3]*a[3] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl 
(%esi), %edx + adcl $0, %ebp + movl %ebx, 24(%edi) + movl 28(%esi), %eax + /* saved r[6] */ + /* ############### Calculate word 7 */ + xorl %ebx, %ebx + /* sqr a[7]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + /* sqr a[6]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + movl 8(%esi), %edx + /* sqr a[5]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %eax + adcl $0, %ebx + movl 12(%esi), %edx + /* sqr a[4]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 28(%edi) + movl 4(%esi), %edx + /* saved r[7] */ + /* ############### Calculate word 8 */ + xorl %ecx, %ecx + /* sqr a[7]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + /* sqr a[6]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 20(%esi), %eax + adcl $0, %ecx + movl 12(%esi), %edx + /* sqr a[5]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + /* sqr a[4]*a[4] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl 8(%esi), %edx + adcl $0, %ecx + movl %ebp, 32(%edi) + movl 28(%esi), %eax + /* saved r[8] */ + /* ############### Calculate word 9 */ + xorl %ebp, %ebp + /* sqr a[7]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + movl 12(%esi), %edx + /* sqr a[6]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + 
addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 16(%esi), %edx + /* sqr a[5]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 28(%esi), %eax + adcl $0, %ebp + movl %ebx, 36(%edi) + movl 12(%esi), %edx + /* saved r[9] */ + /* ############### Calculate word 10 */ + xorl %ebx, %ebx + /* sqr a[7]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 16(%esi), %edx + /* sqr a[6]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + /* sqr a[5]*a[5] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %edx + adcl $0, %ebx + movl %ecx, 40(%edi) + movl 28(%esi), %eax + /* saved r[10] */ + /* ############### Calculate word 11 */ + xorl %ecx, %ecx + /* sqr a[7]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 20(%esi), %edx + /* sqr a[6]*a[5] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 28(%esi), %eax + adcl $0, %ecx + movl %ebp, 44(%edi) + movl 20(%esi), %edx + /* saved r[11] */ + /* ############### Calculate word 12 */ + xorl %ebp, %ebp + /* sqr a[7]*a[5] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + /* sqr a[6]*a[6] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %edx + adcl $0, %ebp + movl %ebx, 48(%edi) + movl 28(%esi), %eax + /* saved r[12] */ + /* ############### Calculate word 13 */ + xorl %ebx, %ebx + /* sqr a[7]*a[6] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 52(%edi) + /* 
saved r[13] */ + /* ############### Calculate word 14 */ + xorl %ecx, %ecx + /* sqr a[7]*a[7] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%edi) + /* saved r[14] */ + movl %ebx, 60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba8_end: + SIZE(bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_sqr_comba4 + TYPE(bn_sqr_comba4,@function) +bn_sqr_comba4: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + /* ############### Calculate word 0 */ + xorl %ebp, %ebp + /* sqr a[0]*a[0] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + /* saved r[0] */ + /* ############### Calculate word 1 */ + xorl %ebx, %ebx + /* sqr a[1]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + /* saved r[1] */ + /* ############### Calculate word 2 */ + xorl %ecx, %ecx + /* sqr a[2]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + /* sqr a[1]*a[1] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + /* saved r[2] */ + /* ############### Calculate word 3 */ + xorl %ebp, %ebp + /* sqr a[3]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[2]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl 4(%esi), %edx + /* saved r[3] */ + /* ############### Calculate 
word 4 */ + xorl %ebx, %ebx + /* sqr a[3]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + /* sqr a[2]*a[2] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 12(%esi), %eax + /* saved r[4] */ + /* ############### Calculate word 5 */ + xorl %ecx, %ecx + /* sqr a[3]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + /* saved r[5] */ + /* ############### Calculate word 6 */ + xorl %ebp, %ebp + /* sqr a[3]*a[3] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%edi) + /* saved r[6] */ + movl %ecx, 28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba4_end: + SIZE(bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4) +.ident "desasm.pl" diff --git a/crypto/bn/asm/elf.s b/crypto/bn/asm/elf.s new file mode 100644 index 0000000000..97ad1264db --- /dev/null +++ b/crypto/bn/asm/elf.s @@ -0,0 +1,1269 @@ + # Don't even think of reading this code + # It was automatically generated by bn-586.pl + # Which is a perl program used to generate the x86 assember for + # any of elf, a.out, BSDI,Win32, or Solaris + # eric <eay@cryptsoft.com> + + .file "bn-586.s" + .version "01.01" +gcc2_compiled.: +.text + .align 16 +.globl bn_mul_comba8 + .type bn_mul_comba8,@function +bn_mul_comba8: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + # ################## Calculate word 0 + xorl %ebp, %ebp + # mul a[0]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx, %ebx + # mul 
a[1]*b[0] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx, %ecx + # mul a[2]*b[0] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[1] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp, %ebp + # mul a[3]*b[0] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[1] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[2] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 16(%esi), %eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx, %ebx + # mul a[4]*b[0] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[1] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[2] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[3] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[4] + mull %edx + addl %eax, %ecx 
+ movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 20(%esi), %eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx, %ecx + # mul a[5]*b[0] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[1] + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[3]*b[2] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[3] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[4] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[5] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 24(%esi), %eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp, %ebp + # mul a[6]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[1] + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[4]*b[2] + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[3]*b[3] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[4] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[5] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[6] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 24(%eax) + movl 28(%esi), %eax + # saved r[6] + # 
################## Calculate word 7 + xorl %ebx, %ebx + # mul a[7]*b[0] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[5]*b[2] + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[4]*b[3] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[4] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[5] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[6] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + movl %ecx, 28(%eax) + movl 28(%esi), %eax + # saved r[7] + # ################## Calculate word 8 + xorl %ecx, %ecx + # mul a[7]*b[1] + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[6]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[5]*b[3] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[4] + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[3]*b[5] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[6] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + 
adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + movl %ebp, 32(%eax) + movl 28(%esi), %eax + # saved r[8] + # ################## Calculate word 9 + xorl %ebp, %ebp + # mul a[7]*b[2] + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[6]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[4] + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + # mul a[4]*b[5] + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[3]*b[6] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[7] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + movl %ebx, 36(%eax) + movl 28(%esi), %eax + # saved r[9] + # ################## Calculate word 10 + xorl %ebx, %ebx + # mul a[7]*b[3] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[4] + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + # mul a[5]*b[5] + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + # mul a[4]*b[6] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + movl %ecx, 40(%eax) + movl 28(%esi), %eax + # saved r[10] + # ################## Calculate word 11 + xorl %ecx, %ecx + # mul a[7]*b[4] + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[6]*b[5] + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl 
%edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + # mul a[5]*b[6] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + movl %ebp, 44(%eax) + movl 28(%esi), %eax + # saved r[11] + # ################## Calculate word 12 + xorl %ebp, %ebp + # mul a[7]*b[5] + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[6]*b[6] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[7] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + movl %ebx, 48(%eax) + movl 28(%esi), %eax + # saved r[12] + # ################## Calculate word 13 + xorl %ebx, %ebx + # mul a[7]*b[6] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + movl %ecx, 52(%eax) + movl 28(%esi), %eax + # saved r[13] + # ################## Calculate word 14 + xorl %ecx, %ecx + # mul a[7]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%eax) + # saved r[14] + # save r[15] + movl %ebx, 60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba8_end: + .size bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8 +.ident "desasm.pl" +.text + .align 16 +.globl bn_mul_comba4 + .type bn_mul_comba4,@function +bn_mul_comba4: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + # ################## Calculate word 0 + xorl %ebp, %ebp + # mul a[0]*b[0] + mull %edx + addl %eax, %ebx + movl 
20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx, %ebx + # mul a[1]*b[0] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx, %ecx + # mul a[2]*b[0] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[1] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp, %ebp + # mul a[3]*b[0] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[1] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[2] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 12(%esi), %eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx, %ebx + # mul a[3]*b[1] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[2] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[3] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 8(%edi), 
%edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 12(%esi), %eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx, %ecx + # mul a[3]*b[2] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[3] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 12(%esi), %eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp, %ebp + # mul a[3]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%eax) + # saved r[6] + # save r[7] + movl %ecx, 28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba4_end: + .size bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4 +.ident "desasm.pl" +.text + .align 16 +.globl bn_sqr_comba8 + .type bn_sqr_comba8,@function +bn_sqr_comba8: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + # ############### Calculate word 0 + xorl %ebp, %ebp + # sqr a[0]*a[0] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx, %ebx + # sqr a[1]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx, %ecx + # sqr a[2]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + # sqr a[1]*a[1] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp, %ebp + # sqr a[3]*a[0] + 
mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[2]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl (%esi), %edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx, %ebx + # sqr a[4]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 12(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + # sqr a[3]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + # sqr a[2]*a[2] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl (%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 20(%esi), %eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx, %ecx + # sqr a[5]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + movl 4(%esi), %edx + # sqr a[4]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + # sqr a[3]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + movl (%esi), %edx + # saved r[5] + # ############### Calculate word 6 + xorl %ebp, %ebp + # sqr a[6]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[5]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl 8(%esi), %edx + # sqr a[4]*a[2] + mull %edx + addl %eax, %eax + 
adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + # sqr a[3]*a[3] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, 24(%edi) + movl 28(%esi), %eax + # saved r[6] + # ############### Calculate word 7 + xorl %ebx, %ebx + # sqr a[7]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + # sqr a[6]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + movl 8(%esi), %edx + # sqr a[5]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %eax + adcl $0, %ebx + movl 12(%esi), %edx + # sqr a[4]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 28(%edi) + movl 4(%esi), %edx + # saved r[7] + # ############### Calculate word 8 + xorl %ecx, %ecx + # sqr a[7]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + # sqr a[6]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 20(%esi), %eax + adcl $0, %ecx + movl 12(%esi), %edx + # sqr a[5]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + # sqr a[4]*a[4] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl 8(%esi), %edx + adcl $0, %ecx + movl %ebp, 32(%edi) + movl 28(%esi), %eax + # saved r[8] + # ############### Calculate word 9 + xorl %ebp, %ebp + # sqr a[7]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl 
$0, %ebp + movl 12(%esi), %edx + # sqr a[6]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 16(%esi), %edx + # sqr a[5]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 28(%esi), %eax + adcl $0, %ebp + movl %ebx, 36(%edi) + movl 12(%esi), %edx + # saved r[9] + # ############### Calculate word 10 + xorl %ebx, %ebx + # sqr a[7]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 16(%esi), %edx + # sqr a[6]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + # sqr a[5]*a[5] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %edx + adcl $0, %ebx + movl %ecx, 40(%edi) + movl 28(%esi), %eax + # saved r[10] + # ############### Calculate word 11 + xorl %ecx, %ecx + # sqr a[7]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 20(%esi), %edx + # sqr a[6]*a[5] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 28(%esi), %eax + adcl $0, %ecx + movl %ebp, 44(%edi) + movl 20(%esi), %edx + # saved r[11] + # ############### Calculate word 12 + xorl %ebp, %ebp + # sqr a[7]*a[5] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + # sqr a[6]*a[6] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %edx + adcl $0, %ebp + movl %ebx, 48(%edi) + movl 28(%esi), %eax + # saved r[12] + # ############### Calculate word 13 + xorl %ebx, %ebx + # sqr a[7]*a[6] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax 
+ adcl $0, %ebx + movl %ecx, 52(%edi) + # saved r[13] + # ############### Calculate word 14 + xorl %ecx, %ecx + # sqr a[7]*a[7] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%edi) + # saved r[14] + movl %ebx, 60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba8_end: + .size bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8 +.ident "desasm.pl" +.text + .align 16 +.globl bn_sqr_comba4 + .type bn_sqr_comba4,@function +bn_sqr_comba4: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + # ############### Calculate word 0 + xorl %ebp, %ebp + # sqr a[0]*a[0] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx, %ebx + # sqr a[1]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx, %ecx + # sqr a[2]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + # sqr a[1]*a[1] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp, %ebp + # sqr a[3]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[2]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl 4(%esi), %edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx, %ebx + # sqr 
a[3]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + # sqr a[2]*a[2] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 12(%esi), %eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx, %ecx + # sqr a[3]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + # saved r[5] + # ############### Calculate word 6 + xorl %ebp, %ebp + # sqr a[3]*a[3] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%edi) + # saved r[6] + movl %ecx, 28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba4_end: + .size bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4 +.ident "desasm.pl" diff --git a/crypto/bn/asm/f b/crypto/bn/asm/f new file mode 100644 index 0000000000..a23fa159b2 --- /dev/null +++ b/crypto/bn/asm/f @@ -0,0 +1,500 @@ + .text + .align 3 + .globl bn_sqr_comba8 + .ent bn_sqr_comba8 +bn_sqr_comba8: +bn_sqr_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + ldq $4, 32($17) + ldq $5, 40($17) + ldq $6, 48($17) + ldq $7, 56($17) + bis $31, $31, $23 + mulq $0, $0, $8 + umulh $0, $0, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $1, $0, $24 + umulh $1, $0, $25 + cmplt $24, $31, $27 + cmplt $25, $31, $28 + addq $24, $24, $24 + addq $25, $25, $25 + addq $25, $27, $25 + addq $8, $28, $8 + addq $22, $24, $22 + addq $23, $25, $23 + cmpult $22, $24, $21 + cmpult $23, $25, $20 + addq $23, $21, $23 + addq $8, $20, $8 + stq $22, 8($16) + bis $31, $31, $22 + mulq $1, $1, $19 + umulh $1, $1, $18 + addq $23, $19, $23 + addq $8, $18, $8 + cmpult $23, $19, $17 + cmpult $8, $18, $27 + addq $8, $17, $8 + addq $22, $27, $22 + mulq $2, $0, $28 + umulh $2, $0, $24 + cmplt $28, $31, $25 + cmplt $24, $31, 
$21 + addq $28, $28, $28 + addq $24, $24, $24 + addq $24, $25, $24 + addq $22, $21, $22 + addq $23, $28, $23 + addq $8, $24, $8 + cmpult $23, $28, $20 + cmpult $8, $24, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $2, $1, $18 + umulh $2, $1, $17 + cmplt $18, $31, $27 + cmplt $17, $31, $25 + addq $18, $18, $18 + addq $17, $17, $17 + addq $17, $27, $17 + addq $23, $25, $23 + addq $8, $18, $8 + addq $22, $17, $22 + cmpult $8, $18, $21 + cmpult $22, $17, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $3, $0, $24 + umulh $3, $0, $20 + cmplt $24, $31, $19 + cmplt $20, $31, $27 + addq $24, $24, $24 + addq $20, $20, $20 + addq $20, $19, $20 + addq $23, $27, $23 + addq $8, $24, $8 + addq $22, $20, $22 + cmpult $8, $24, $25 + cmpult $22, $20, $18 + addq $22, $25, $22 + addq $23, $18, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $2, $17 + umulh $2, $2, $21 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $28 + cmpult $23, $21, $19 + addq $23, $28, $23 + addq $8, $19, $8 + mulq $3, $1, $27 + umulh $3, $1, $24 + cmplt $27, $31, $20 + cmplt $24, $31, $25 + addq $27, $27, $27 + addq $24, $24, $24 + addq $24, $20, $24 + addq $8, $25, $8 + addq $22, $27, $22 + addq $23, $24, $23 + cmpult $22, $27, $18 + cmpult $23, $24, $17 + addq $23, $18, $23 + addq $8, $17, $8 + mulq $4, $0, $21 + umulh $4, $0, $28 + cmplt $21, $31, $19 + cmplt $28, $31, $20 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $19, $28 + addq $8, $20, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $25 + cmpult $23, $28, $27 + addq $23, $25, $23 + addq $8, $27, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $3, $2, $24 + umulh $3, $2, $18 + cmplt $24, $31, $17 + cmplt $18, $31, $19 + addq $24, $24, $24 + addq $18, $18, $18 + addq $18, $17, $18 + addq $22, $19, $22 + addq $23, $24, $23 + addq $8, $18, $8 + cmpult $23, $24, $20 + cmpult $8, $18, $21 + addq $8, $20, $8 + addq $22, $21, $22 + mulq $4, $1, $28 + umulh $4, $1, $25 
+ cmplt $28, $31, $27 + cmplt $25, $31, $17 + addq $28, $28, $28 + addq $25, $25, $25 + addq $25, $27, $25 + addq $22, $17, $22 + addq $23, $28, $23 + addq $8, $25, $8 + cmpult $23, $28, $19 + cmpult $8, $25, $24 + addq $8, $19, $8 + addq $22, $24, $22 + mulq $5, $0, $18 + umulh $5, $0, $20 + cmplt $18, $31, $21 + cmplt $20, $31, $27 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $21, $20 + addq $22, $27, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $28 + addq $8, $17, $8 + addq $22, $28, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $3, $3, $25 + umulh $3, $3, $19 + addq $8, $25, $8 + addq $22, $19, $22 + cmpult $8, $25, $24 + cmpult $22, $19, $21 + addq $22, $24, $22 + addq $23, $21, $23 + mulq $4, $2, $27 + umulh $4, $2, $18 + cmplt $27, $31, $20 + cmplt $18, $31, $17 + addq $27, $27, $27 + addq $18, $18, $18 + addq $18, $20, $18 + addq $23, $17, $23 + addq $8, $27, $8 + addq $22, $18, $22 + cmpult $8, $27, $28 + cmpult $22, $18, $25 + addq $22, $28, $22 + addq $23, $25, $23 + mulq $5, $1, $19 + umulh $5, $1, $24 + cmplt $19, $31, $21 + cmplt $24, $31, $20 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $21, $24 + addq $23, $20, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $17 + cmpult $22, $24, $27 + addq $22, $17, $22 + addq $23, $27, $23 + mulq $6, $0, $18 + umulh $6, $0, $28 + cmplt $18, $31, $25 + cmplt $28, $31, $21 + addq $18, $18, $18 + addq $28, $28, $28 + addq $28, $25, $28 + addq $23, $21, $23 + addq $8, $18, $8 + addq $22, $28, $22 + cmpult $8, $18, $20 + cmpult $22, $28, $19 + addq $22, $20, $22 + addq $23, $19, $23 + stq $8, 48($16) + bis $31, $31, $8 + mulq $4, $3, $24 + umulh $4, $3, $17 + cmplt $24, $31, $27 + cmplt $17, $31, $25 + addq $24, $24, $24 + addq $17, $17, $17 + addq $17, $27, $17 + addq $8, $25, $8 + addq $22, $24, $22 + addq $23, $17, $23 + cmpult $22, $24, $21 + cmpult $23, $17, $18 + addq $23, $21, $23 + addq $8, $18, $8 + mulq $5, $2, $28 + umulh $5, $2, 
$20 + cmplt $28, $31, $19 + cmplt $20, $31, $27 + addq $28, $28, $28 + addq $20, $20, $20 + addq $20, $19, $20 + addq $8, $27, $8 + addq $22, $28, $22 + addq $23, $20, $23 + cmpult $22, $28, $25 + cmpult $23, $20, $24 + addq $23, $25, $23 + addq $8, $24, $8 + mulq $6, $1, $17 + umulh $6, $1, $21 + cmplt $17, $31, $18 + cmplt $21, $31, $19 + addq $17, $17, $17 + addq $21, $21, $21 + addq $21, $18, $21 + addq $8, $19, $8 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $27 + cmpult $23, $21, $28 + addq $23, $27, $23 + addq $8, $28, $8 + mulq $7, $0, $20 + umulh $7, $0, $25 + cmplt $20, $31, $24 + cmplt $25, $31, $18 + addq $20, $20, $20 + addq $25, $25, $25 + addq $25, $24, $25 + addq $8, $18, $8 + addq $22, $20, $22 + addq $23, $25, $23 + cmpult $22, $20, $19 + cmpult $23, $25, $17 + addq $23, $19, $23 + addq $8, $17, $8 + stq $22, 56($16) + bis $31, $31, $22 + mulq $4, $4, $21 + umulh $4, $4, $27 + addq $23, $21, $23 + addq $8, $27, $8 + cmpult $23, $21, $28 + cmpult $8, $27, $24 + addq $8, $28, $8 + addq $22, $24, $22 + mulq $5, $3, $18 + umulh $5, $3, $20 + cmplt $18, $31, $25 + cmplt $20, $31, $19 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $25, $20 + addq $22, $19, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $21 + addq $8, $17, $8 + addq $22, $21, $22 + mulq $6, $2, $27 + umulh $6, $2, $28 + cmplt $27, $31, $24 + cmplt $28, $31, $25 + addq $27, $27, $27 + addq $28, $28, $28 + addq $28, $24, $28 + addq $22, $25, $22 + addq $23, $27, $23 + addq $8, $28, $8 + cmpult $23, $27, $19 + cmpult $8, $28, $18 + addq $8, $19, $8 + addq $22, $18, $22 + mulq $7, $1, $20 + umulh $7, $1, $17 + cmplt $20, $31, $21 + cmplt $17, $31, $24 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $21, $17 + addq $22, $24, $22 + addq $23, $20, $23 + addq $8, $17, $8 + cmpult $23, $20, $25 + cmpult $8, $17, $27 + addq $8, $25, $8 + addq $22, $27, $22 + stq $23, 64($16) + bis $31, $31, $23 + mulq $5, $4, $28 + umulh $5, $4, 
$19 + cmplt $28, $31, $18 + cmplt $19, $31, $21 + addq $28, $28, $28 + addq $19, $19, $19 + addq $19, $18, $19 + addq $23, $21, $23 + addq $8, $28, $8 + addq $22, $19, $22 + cmpult $8, $28, $24 + cmpult $22, $19, $20 + addq $22, $24, $22 + addq $23, $20, $23 + mulq $6, $3, $17 + umulh $6, $3, $25 + cmplt $17, $31, $27 + cmplt $25, $31, $18 + addq $17, $17, $17 + addq $25, $25, $25 + addq $25, $27, $25 + addq $23, $18, $23 + addq $8, $17, $8 + addq $22, $25, $22 + cmpult $8, $17, $21 + cmpult $22, $25, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $7, $2, $19 + umulh $7, $2, $24 + cmplt $19, $31, $20 + cmplt $24, $31, $27 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $20, $24 + addq $23, $27, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $18 + cmpult $22, $24, $17 + addq $22, $18, $22 + addq $23, $17, $23 + stq $8, 72($16) + bis $31, $31, $8 + mulq $5, $5, $25 + umulh $5, $5, $21 + addq $22, $25, $22 + addq $23, $21, $23 + cmpult $22, $25, $28 + cmpult $23, $21, $20 + addq $23, $28, $23 + addq $8, $20, $8 + mulq $6, $4, $27 + umulh $6, $4, $19 + cmplt $27, $31, $24 + cmplt $19, $31, $18 + addq $27, $27, $27 + addq $19, $19, $19 + addq $19, $24, $19 + addq $8, $18, $8 + addq $22, $27, $22 + addq $23, $19, $23 + cmpult $22, $27, $17 + cmpult $23, $19, $25 + addq $23, $17, $23 + addq $8, $25, $8 + mulq $7, $3, $21 + umulh $7, $3, $28 + cmplt $21, $31, $20 + cmplt $28, $31, $24 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $20, $28 + addq $8, $24, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $18 + cmpult $23, $28, $27 + addq $23, $18, $23 + addq $8, $27, $8 + stq $22, 80($16) + bis $31, $31, $22 + mulq $6, $5, $19 + umulh $6, $5, $17 + cmplt $19, $31, $25 + cmplt $17, $31, $20 + addq $19, $19, $19 + addq $17, $17, $17 + addq $17, $25, $17 + addq $22, $20, $22 + addq $23, $19, $23 + addq $8, $17, $8 + cmpult $23, $19, $24 + cmpult $8, $17, $21 + addq $8, $24, $8 + addq $22, $21, $22 + mulq $7, $4, $28 + umulh $7, 
$4, $18 + cmplt $28, $31, $27 + cmplt $18, $31, $25 + addq $28, $28, $28 + addq $18, $18, $18 + addq $18, $27, $18 + addq $22, $25, $22 + addq $23, $28, $23 + addq $8, $18, $8 + cmpult $23, $28, $20 + cmpult $8, $18, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 88($16) + bis $31, $31, $23 + mulq $6, $6, $17 + umulh $6, $6, $24 + addq $8, $17, $8 + addq $22, $24, $22 + cmpult $8, $17, $21 + cmpult $22, $24, $27 + addq $22, $21, $22 + addq $23, $27, $23 + mulq $7, $5, $25 + umulh $7, $5, $28 + cmplt $25, $31, $18 + cmplt $28, $31, $20 + addq $25, $25, $25 + addq $28, $28, $28 + addq $28, $18, $28 + addq $23, $20, $23 + addq $8, $25, $8 + addq $22, $28, $22 + cmpult $8, $25, $19 + cmpult $22, $28, $17 + addq $22, $19, $22 + addq $23, $17, $23 + stq $8, 96($16) + bis $31, $31, $8 + mulq $7, $6, $24 + umulh $7, $6, $21 + cmplt $24, $31, $27 + cmplt $21, $31, $18 + addq $24, $24, $24 + addq $21, $21, $21 + addq $21, $27, $21 + addq $8, $18, $8 + addq $22, $24, $22 + addq $23, $21, $23 + cmpult $22, $24, $20 + cmpult $23, $21, $25 + addq $23, $20, $23 + addq $8, $25, $8 + stq $22, 104($16) + bis $31, $31, $22 + mulq $7, $7, $28 + umulh $7, $7, $19 + addq $23, $28, $23 + addq $8, $19, $8 + cmpult $23, $28, $17 + cmpult $8, $19, $27 + addq $8, $17, $8 + addq $22, $27, $22 + stq $23, 112($16) + stq $8, 120($16) + ret $31,($26),1 + .end bn_sqr_comba8 diff --git a/crypto/bn/asm/f.c b/crypto/bn/asm/f.c new file mode 100644 index 0000000000..bfdccae4a0 --- /dev/null +++ b/crypto/bn/asm/f.c @@ -0,0 +1,8 @@ +int abc(a,b,c,d,e,f,g,h,i,j) +unsigned long a,b,c,d,e,f,g,h,i,j; + { + gg(g); + if (g) + gg(h); + gg(i); + } diff --git a/crypto/bn/asm/f.elf b/crypto/bn/asm/f.elf new file mode 100644 index 0000000000..39d07b79e1 --- /dev/null +++ b/crypto/bn/asm/f.elf @@ -0,0 +1,2149 @@ + # Don't even think of reading this code + # It was automatically generated by bn-586.pl + # Which is a perl program used to generate the x86 assember for + # any of elf, a.out, BSDI,Win32, or 
Solaris + # eric <eay@cryptsoft.com> + + .file "bn-586.s" + .version "01.01" +gcc2_compiled.: +.text + .align 16 +.globl bn_mul_add_words + .type bn_mul_add_words,@function +bn_mul_add_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + xorl %esi, %esi + movl 20(%esp), %edi + movl 28(%esp), %ecx + movl 24(%esp), %ebx + andl $4294967288, %ecx + movl 32(%esp), %ebp + pushl %ecx + jz .L000maw_finish +.L001maw_loop: + movl %ecx, (%esp) + # Round 0 + movl (%ebx), %eax + mull %ebp + addl %esi, %eax + movl (%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, (%edi) + movl %edx, %esi + # Round 4 + movl 4(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 4(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 4(%edi) + movl %edx, %esi + # Round 8 + movl 8(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 8(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 8(%edi) + movl %edx, %esi + # Round 12 + movl 12(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 12(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 12(%edi) + movl %edx, %esi + # Round 16 + movl 16(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 16(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 16(%edi) + movl %edx, %esi + # Round 20 + movl 20(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 20(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 20(%edi) + movl %edx, %esi + # Round 24 + movl 24(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 24(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 24(%edi) + movl %edx, %esi + # Round 28 + movl 28(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 28(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 28(%edi) + movl %edx, %esi + + movl (%esp), %ecx + addl $32, %ebx + addl $32, %edi + subl $8, %ecx + jnz .L001maw_loop +.L000maw_finish: + movl 32(%esp), %ecx + andl $7, %ecx + 
jnz .L002maw_finish2 + jmp .L003maw_end +.align 16 +.L002maw_finish2: + # Tail Round 0 + movl (%ebx), %eax + mull %ebp + addl %esi, %eax + movl (%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, (%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 1 + movl 4(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 4(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 4(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 2 + movl 8(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 8(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 8(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 3 + movl 12(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 12(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 12(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 4 + movl 16(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 16(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 16(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 5 + movl 20(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 20(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 20(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 6 + movl 24(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 24(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 24(%edi) + movl %edx, %esi +.L003maw_end: + movl %esi, %eax + popl %ecx + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_mul_add_words_end: + .size bn_mul_add_words,.bn_mul_add_words_end-bn_mul_add_words +.ident "bn_mul_add_words" +.text + .align 16 +.globl bn_mul_words + .type bn_mul_words,@function +bn_mul_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + xorl %esi, %esi + movl 20(%esp), %edi + movl 24(%esp), %ebx + movl 28(%esp), %ebp + movl 32(%esp), %ecx + andl $4294967288, %ebp + jz 
.L004mw_finish +.L005mw_loop: + # Round 0 + movl (%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, (%edi) + movl %edx, %esi + # Round 4 + movl 4(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 4(%edi) + movl %edx, %esi + # Round 8 + movl 8(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 8(%edi) + movl %edx, %esi + # Round 12 + movl 12(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 12(%edi) + movl %edx, %esi + # Round 16 + movl 16(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 16(%edi) + movl %edx, %esi + # Round 20 + movl 20(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 20(%edi) + movl %edx, %esi + # Round 24 + movl 24(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 24(%edi) + movl %edx, %esi + # Round 28 + movl 28(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 28(%edi) + movl %edx, %esi + + addl $32, %ebx + addl $32, %edi + subl $8, %ebp + jz .L004mw_finish + jmp .L005mw_loop +.L004mw_finish: + movl 28(%esp), %ebp + andl $7, %ebp + jnz .L006mw_finish2 + jmp .L007mw_end +.align 16 +.L006mw_finish2: + # Tail Round 0 + movl (%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, (%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 1 + movl 4(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 4(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 2 + movl 8(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 8(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 3 + movl 12(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 12(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 4 + movl 16(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 16(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 5 + movl 20(%ebx), 
%eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 20(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 6 + movl 24(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 24(%edi) + movl %edx, %esi +.L007mw_end: + movl %esi, %eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_mul_words_end: + .size bn_mul_words,.bn_mul_words_end-bn_mul_words +.ident "bn_mul_words" +.text + .align 16 +.globl bn_sqr_words + .type bn_sqr_words,@function +bn_sqr_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + movl 20(%esp), %esi + movl 24(%esp), %edi + movl 28(%esp), %ebx + andl $4294967288, %ebx + jz .L008sw_finish +.L009sw_loop: + # Round 0 + movl (%edi), %eax + mull %eax + movl %eax, (%esi) + movl %edx, 4(%esi) + # Round 4 + movl 4(%edi), %eax + mull %eax + movl %eax, 8(%esi) + movl %edx, 12(%esi) + # Round 8 + movl 8(%edi), %eax + mull %eax + movl %eax, 16(%esi) + movl %edx, 20(%esi) + # Round 12 + movl 12(%edi), %eax + mull %eax + movl %eax, 24(%esi) + movl %edx, 28(%esi) + # Round 16 + movl 16(%edi), %eax + mull %eax + movl %eax, 32(%esi) + movl %edx, 36(%esi) + # Round 20 + movl 20(%edi), %eax + mull %eax + movl %eax, 40(%esi) + movl %edx, 44(%esi) + # Round 24 + movl 24(%edi), %eax + mull %eax + movl %eax, 48(%esi) + movl %edx, 52(%esi) + # Round 28 + movl 28(%edi), %eax + mull %eax + movl %eax, 56(%esi) + movl %edx, 60(%esi) + + addl $32, %edi + addl $64, %esi + subl $8, %ebx + jnz .L009sw_loop +.L008sw_finish: + movl 28(%esp), %ebx + andl $7, %ebx + jz .L010sw_end + # Tail Round 0 + movl (%edi), %eax + mull %eax + movl %eax, (%esi) + decl %ebx + movl %edx, 4(%esi) + jz .L010sw_end + # Tail Round 1 + movl 4(%edi), %eax + mull %eax + movl %eax, 8(%esi) + decl %ebx + movl %edx, 12(%esi) + jz .L010sw_end + # Tail Round 2 + movl 8(%edi), %eax + mull %eax + movl %eax, 16(%esi) + decl %ebx + movl %edx, 20(%esi) + jz .L010sw_end + # Tail Round 3 + movl 12(%edi), %eax + mull %eax + movl %eax, 24(%esi) + decl 
%ebx + movl %edx, 28(%esi) + jz .L010sw_end + # Tail Round 4 + movl 16(%edi), %eax + mull %eax + movl %eax, 32(%esi) + decl %ebx + movl %edx, 36(%esi) + jz .L010sw_end + # Tail Round 5 + movl 20(%edi), %eax + mull %eax + movl %eax, 40(%esi) + decl %ebx + movl %edx, 44(%esi) + jz .L010sw_end + # Tail Round 6 + movl 24(%edi), %eax + mull %eax + movl %eax, 48(%esi) + movl %edx, 52(%esi) +.L010sw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_sqr_words_end: + .size bn_sqr_words,.bn_sqr_words_end-bn_sqr_words +.ident "bn_sqr_words" +.text + .align 16 +.globl bn_div64 + .type bn_div64,@function +bn_div64: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp), %edx + movl 24(%esp), %eax + movl 28(%esp), %ebx + divl %ebx + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_div64_end: + .size bn_div64,.bn_div64_end-bn_div64 +.ident "bn_div64" +.text + .align 16 +.globl bn_add_words + .type bn_add_words,@function +bn_add_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + movl 20(%esp), %ebx + movl 24(%esp), %esi + movl 28(%esp), %edi + movl 32(%esp), %ebp + xorl %eax, %eax + andl $4294967288, %ebp + jz .L011aw_finish +.L012aw_loop: + # Round 0 + movl (%esi), %ecx + movl (%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, (%ebx) + # Round 1 + movl 4(%esi), %ecx + movl 4(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 4(%ebx) + # Round 2 + movl 8(%esi), %ecx + movl 8(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 8(%ebx) + # Round 3 + movl 12(%esi), %ecx + movl 12(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 12(%ebx) + # Round 4 + movl 16(%esi), %ecx + movl 16(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 
16(%ebx) + # Round 5 + movl 20(%esi), %ecx + movl 20(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 20(%ebx) + # Round 6 + movl 24(%esi), %ecx + movl 24(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) + # Round 7 + movl 28(%esi), %ecx + movl 28(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 28(%ebx) + + addl $32, %esi + addl $32, %edi + addl $32, %ebx + subl $8, %ebp + jnz .L012aw_loop +.L011aw_finish: + movl 32(%esp), %ebp + andl $7, %ebp + jz .L013aw_end + # Tail Round 0 + movl (%esi), %ecx + movl (%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, (%ebx) + jz .L013aw_end + # Tail Round 1 + movl 4(%esi), %ecx + movl 4(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 4(%ebx) + jz .L013aw_end + # Tail Round 2 + movl 8(%esi), %ecx + movl 8(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 8(%ebx) + jz .L013aw_end + # Tail Round 3 + movl 12(%esi), %ecx + movl 12(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 12(%ebx) + jz .L013aw_end + # Tail Round 4 + movl 16(%esi), %ecx + movl 16(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 16(%ebx) + jz .L013aw_end + # Tail Round 5 + movl 20(%esi), %ecx + movl 20(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 20(%ebx) + jz .L013aw_end + # Tail Round 6 + movl 24(%esi), %ecx + movl 24(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 
24(%ebx) +.L013aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_add_words_end: + .size bn_add_words,.bn_add_words_end-bn_add_words +.ident "bn_add_words" +.text + .align 16 +.globl bn_sub_words + .type bn_sub_words,@function +bn_sub_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + movl 20(%esp), %ebx + movl 24(%esp), %esi + movl 28(%esp), %edi + movl 32(%esp), %ebp + xorl %eax, %eax + andl $4294967288, %ebp + jz .L014aw_finish +.L015aw_loop: + # Round 0 + movl (%esi), %ecx + movl (%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, (%ebx) + # Round 1 + movl 4(%esi), %ecx + movl 4(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 4(%ebx) + # Round 2 + movl 8(%esi), %ecx + movl 8(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 8(%ebx) + # Round 3 + movl 12(%esi), %ecx + movl 12(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 12(%ebx) + # Round 4 + movl 16(%esi), %ecx + movl 16(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 16(%ebx) + # Round 5 + movl 20(%esi), %ecx + movl 20(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 20(%ebx) + # Round 6 + movl 24(%esi), %ecx + movl 24(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) + # Round 7 + movl 28(%esi), %ecx + movl 28(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 28(%ebx) + + addl $32, %esi + addl $32, %edi + addl $32, %ebx + subl $8, %ebp + jnz .L015aw_loop +.L014aw_finish: + movl 32(%esp), %ebp + andl $7, %ebp + jz .L016aw_end + # Tail Round 0 + movl (%esi), %ecx + movl (%edi), %edx + 
subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, (%ebx) + jz .L016aw_end + # Tail Round 1 + movl 4(%esi), %ecx + movl 4(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 4(%ebx) + jz .L016aw_end + # Tail Round 2 + movl 8(%esi), %ecx + movl 8(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 8(%ebx) + jz .L016aw_end + # Tail Round 3 + movl 12(%esi), %ecx + movl 12(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 12(%ebx) + jz .L016aw_end + # Tail Round 4 + movl 16(%esi), %ecx + movl 16(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 16(%ebx) + jz .L016aw_end + # Tail Round 5 + movl 20(%esi), %ecx + movl 20(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 20(%ebx) + jz .L016aw_end + # Tail Round 6 + movl 24(%esi), %ecx + movl 24(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) +.L016aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_sub_words_end: + .size bn_sub_words,.bn_sub_words_end-bn_sub_words +.ident "bn_sub_words" +.text + .align 16 +.globl bn_mul_comba8 + .type bn_mul_comba8,@function +bn_mul_comba8: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + # ################## Calculate word 0 + xorl %ebp, %ebp + # mul a[0]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + # saved r[0] + # ################## Calculate word 1 + xorl 
%ebx, %ebx + # mul a[1]*b[0] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx, %ecx + # mul a[2]*b[0] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[1] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp, %ebp + # mul a[3]*b[0] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[1] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[2] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 16(%esi), %eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx, %ebx + # mul a[4]*b[0] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[1] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[2] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[3] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[4] + mull 
%edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 20(%esi), %eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx, %ecx + # mul a[5]*b[0] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[1] + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[3]*b[2] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[3] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[4] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[5] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 24(%esi), %eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp, %ebp + # mul a[6]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[1] + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[4]*b[2] + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[3]*b[3] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[4] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[5] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[6] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 24(%eax) + movl 28(%esi), %eax 
+ # saved r[6] + # ################## Calculate word 7 + xorl %ebx, %ebx + # mul a[7]*b[0] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[5]*b[2] + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[4]*b[3] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[4] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[5] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[6] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + movl %ecx, 28(%eax) + movl 28(%esi), %eax + # saved r[7] + # ################## Calculate word 8 + xorl %ecx, %ecx + # mul a[7]*b[1] + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[6]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[5]*b[3] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[4] + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[3]*b[5] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[6] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[7] + mull %edx + addl %eax, %ebp + movl 
20(%esp), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + movl %ebp, 32(%eax) + movl 28(%esi), %eax + # saved r[8] + # ################## Calculate word 9 + xorl %ebp, %ebp + # mul a[7]*b[2] + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[6]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[4] + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + # mul a[4]*b[5] + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[3]*b[6] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[7] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + movl %ebx, 36(%eax) + movl 28(%esi), %eax + # saved r[9] + # ################## Calculate word 10 + xorl %ebx, %ebx + # mul a[7]*b[3] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[4] + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + # mul a[5]*b[5] + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + # mul a[4]*b[6] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + movl %ecx, 40(%eax) + movl 28(%esi), %eax + # saved r[10] + # ################## Calculate word 11 + xorl %ecx, %ecx + # mul a[7]*b[4] + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[6]*b[5] + mull %edx + addl %eax, %ebp + movl 20(%esi), 
%eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + # mul a[5]*b[6] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + movl %ebp, 44(%eax) + movl 28(%esi), %eax + # saved r[11] + # ################## Calculate word 12 + xorl %ebp, %ebp + # mul a[7]*b[5] + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[6]*b[6] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[7] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + movl %ebx, 48(%eax) + movl 28(%esi), %eax + # saved r[12] + # ################## Calculate word 13 + xorl %ebx, %ebx + # mul a[7]*b[6] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + movl %ecx, 52(%eax) + movl 28(%esi), %eax + # saved r[13] + # ################## Calculate word 14 + xorl %ecx, %ecx + # mul a[7]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%eax) + # saved r[14] + # save r[15] + movl %ebx, 60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba8_end: + .size bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8 +.ident "desasm.pl" +.text + .align 16 +.globl bn_mul_comba4 + .type bn_mul_comba4,@function +bn_mul_comba4: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + # ################## Calculate word 0 + xorl %ebp, %ebp + # mul a[0]*b[0] + mull %edx + addl %eax, %ebx + 
movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx, %ebx + # mul a[1]*b[0] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx, %ecx + # mul a[2]*b[0] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[1] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp, %ebp + # mul a[3]*b[0] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[1] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[2] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 12(%esi), %eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx, %ebx + # mul a[3]*b[1] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[2] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[3] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 
8(%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 12(%esi), %eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx, %ecx + # mul a[3]*b[2] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[3] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 12(%esi), %eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp, %ebp + # mul a[3]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%eax) + # saved r[6] + # save r[7] + movl %ecx, 28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba4_end: + .size bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4 +.ident "desasm.pl" +.text + .align 16 +.globl bn_sqr_comba8 + .type bn_sqr_comba8,@function +bn_sqr_comba8: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + # ############### Calculate word 0 + xorl %ebp, %ebp + # sqr a[0]*a[0] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx, %ebx + # sqr a[1]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx, %ecx + # sqr a[2]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + # sqr a[1]*a[1] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp, %ebp + # sqr 
a[3]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[2]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl (%esi), %edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx, %ebx + # sqr a[4]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 12(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + # sqr a[3]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + # sqr a[2]*a[2] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl (%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 20(%esi), %eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx, %ecx + # sqr a[5]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + movl 4(%esi), %edx + # sqr a[4]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + # sqr a[3]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + movl (%esi), %edx + # saved r[5] + # ############### Calculate word 6 + xorl %ebp, %ebp + # sqr a[6]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[5]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl 8(%esi), %edx + # sqr a[4]*a[2] + mull %edx + addl 
%eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + # sqr a[3]*a[3] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, 24(%edi) + movl 28(%esi), %eax + # saved r[6] + # ############### Calculate word 7 + xorl %ebx, %ebx + # sqr a[7]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + # sqr a[6]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + movl 8(%esi), %edx + # sqr a[5]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %eax + adcl $0, %ebx + movl 12(%esi), %edx + # sqr a[4]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 28(%edi) + movl 4(%esi), %edx + # saved r[7] + # ############### Calculate word 8 + xorl %ecx, %ecx + # sqr a[7]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + # sqr a[6]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 20(%esi), %eax + adcl $0, %ecx + movl 12(%esi), %edx + # sqr a[5]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + # sqr a[4]*a[4] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl 8(%esi), %edx + adcl $0, %ecx + movl %ebp, 32(%edi) + movl 28(%esi), %eax + # saved r[8] + # ############### Calculate word 9 + xorl %ebp, %ebp + # sqr a[7]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), 
%eax + adcl $0, %ebp + movl 12(%esi), %edx + # sqr a[6]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 16(%esi), %edx + # sqr a[5]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 28(%esi), %eax + adcl $0, %ebp + movl %ebx, 36(%edi) + movl 12(%esi), %edx + # saved r[9] + # ############### Calculate word 10 + xorl %ebx, %ebx + # sqr a[7]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 16(%esi), %edx + # sqr a[6]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + # sqr a[5]*a[5] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %edx + adcl $0, %ebx + movl %ecx, 40(%edi) + movl 28(%esi), %eax + # saved r[10] + # ############### Calculate word 11 + xorl %ecx, %ecx + # sqr a[7]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 20(%esi), %edx + # sqr a[6]*a[5] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 28(%esi), %eax + adcl $0, %ecx + movl %ebp, 44(%edi) + movl 20(%esi), %edx + # saved r[11] + # ############### Calculate word 12 + xorl %ebp, %ebp + # sqr a[7]*a[5] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + # sqr a[6]*a[6] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %edx + adcl $0, %ebp + movl %ebx, 48(%edi) + movl 28(%esi), %eax + # saved r[12] + # ############### Calculate word 13 + xorl %ebx, %ebx + # sqr a[7]*a[6] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 
28(%esi), %eax + adcl $0, %ebx + movl %ecx, 52(%edi) + # saved r[13] + # ############### Calculate word 14 + xorl %ecx, %ecx + # sqr a[7]*a[7] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%edi) + # saved r[14] + movl %ebx, 60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba8_end: + .size bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8 +.ident "desasm.pl" +.text + .align 16 +.globl bn_sqr_comba4 + .type bn_sqr_comba4,@function +bn_sqr_comba4: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + # ############### Calculate word 0 + xorl %ebp, %ebp + # sqr a[0]*a[0] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx, %ebx + # sqr a[1]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx, %ecx + # sqr a[2]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + # sqr a[1]*a[1] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp, %ebp + # sqr a[3]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[2]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl 4(%esi), %edx + # saved r[3] + # ############### Calculate word 4 + xorl 
%ebx, %ebx + # sqr a[3]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + # sqr a[2]*a[2] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 12(%esi), %eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx, %ecx + # sqr a[3]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + # saved r[5] + # ############### Calculate word 6 + xorl %ebp, %ebp + # sqr a[3]*a[3] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%edi) + # saved r[6] + movl %ecx, 28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba4_end: + .size bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4 +.ident "desasm.pl" diff --git a/crypto/bn/asm/f.s b/crypto/bn/asm/f.s new file mode 100644 index 0000000000..2f8f63c690 --- /dev/null +++ b/crypto/bn/asm/f.s @@ -0,0 +1,1773 @@ + # Don't even think of reading this code + # It was automatically generated by bn-586.pl + # Which is a perl program used to generate the alpha assember. 
+ # eric <eay@cryptsoft.com> + + # DEC Alpha assember + # Generated from perl scripts contains in SSLeay + .file 1 "bn-586.s" + .set noat + .text + .align 3 + .globl bn_mul_words + .ent bn_mul_words +bn_mul_words: +bn_mul_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18, 4, $18 + bis $31, $31, $0 + br $100 + blt $18, $100 + ldq $1, 0($17) + ldq $2, 0($16) +$101: + ldq $3, 0($17) + mulq $3, $19, $4 + addq $17, 8, $17 + umulh $3, $19, $5 + addq $4, $0, $4 + addq $16, 8, $16 + subq $18, 1, $18 + cmpult $4, $0, $0 + stq $4, -8($16) + addq $5, $0, $0 + bgt $18, $101 + ret $31,($26),1 +$100: + addq $18, 4, $18 + bgt $18, $101 +$102: + ret $31,($26),1 + .end bn_mul_words + .text + .align 3 + .globl bn_sqr_words + .ent bn_sqr_words +bn_sqr_words: +bn_sqr_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18, 4, $18 + bis $31, $31, $0 + br $103 + blt $18, $103 + ldq $1, 0($17) + ldq $2, 0($16) +$104: + ldq $3, 0($17) + mulq $3, $3, $4 + addq $17, 8, $17 + addq $16, 16, $16 + subq $18, 1, $18 + umulh $3, $3, $5 + stq $4, -16($16) + stq $5, -8($16) + bgt $18, $104 + ret $31,($26),1 +$103: + addq $18, 4, $18 + bgt $18, $104 +$105: + ret $31,($26),1 + .end bn_sqr_words + .text + .align 3 + .globl bn_mul_add_words + .ent bn_mul_add_words +bn_mul_add_words: +bn_mul_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18, 4, $18 + bis $31, $31, $0 + br $106 + blt $18, $106 + ldq $1, 0($17) + ldq $2, 0($16) +$107: + ldq $3, 0($17) + ldq $4, 0($16) + mulq $3, $19, $5 + subq $18, 1, $18 + addq $17, 8, $17 + umulh $3, $19, $6 + addq $4, $5, $4 + addq $16, 8, $16 + cmpult $4, $5, $7 + addq $4, $0, $4 + addq $6, $7, $6 + cmpult $4, $0, $0 + stq $4, -8($16) + addq $6, $0, $0 + bgt $18, $107 + ret $31,($26),1 +$106: + addq $18, 4, $18 + bgt $18, $107 +$108: + ret $31,($26),1 + .end bn_mul_add_words + .text + .align 3 + .globl bn_add_words + .ent bn_add_words +bn_add_words: +bn_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19, 4, $19 + bis $31, $31, $0 + 
br $109 + blt $19, $109 + ldq $1, 0($17) + ldq $2, 0($18) +$110: + ldq $3, 8($17) + ldq $4, 8($18) + ldq $5, 16($17) + ldq $6, 16($18) + ldq $7, 24($17) + ldq $8, 24($18) + addq $1, $2, $22 + cmpult $22, $2, $23 + addq $22, $0, $22 + cmpult $22, $0, $0 + addq $0, $23, $0 + addq $3, $4, $25 + cmpult $25, $4, $24 + addq $25, $0, $25 + cmpult $25, $0, $0 + addq $0, $24, $0 + addq $5, $6, $28 + cmpult $28, $6, $27 + addq $28, $0, $28 + cmpult $28, $0, $0 + addq $0, $27, $0 + addq $7, $8, $20 + cmpult $20, $8, $21 + addq $20, $0, $20 + cmpult $20, $0, $0 + addq $0, $21, $0 + stq $22, 0($16) + stq $25, 0($16) + stq $28, 0($16) + stq $20, 0($16) + subq $19, 4, $19 + addq $17, 32, $17 + addq $18, 32, $18 + addq $16, 32, $16 + blt $19, $109 + ldq $1, 0($17) + ldq $2, 0($18) + br $110 +$111: + ldq $1, 0($17) + ldq $2, 0($18) + addq $1, $2, $3 + cmpult $3, $2, $23 + addq $3, $0, $3 + cmpult $3, $0, $0 + addq $0, $23, $0 + stq $3, 0($16) + addq $17, 8, $17 + addq $18, 8, $18 + addq $16, 8, $16 + subq $19, 1, $19 + bgt $19, $111 + ret $31,($26),1 +$109: + addq $19, 4, $19 + bgt $19, $111 +$112: + ret $31,($26),1 + .end bn_add_words + .text + .align 3 + .globl bn_sub_words + .ent bn_sub_words +bn_sub_words: +bn_sub_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19, 4, $19 + bis $31, $31, $0 + blt $19, $113 + ldq $1, 0($17) + ldq $2, 0($18) +$114: + ldq $3, 8($17) + cmpult $1, $2, $4 + ldq $5, 8($18) + subq $1, $2, $1 + ldq $6, 16($17) + cmpult $1, $0, $2 + ldq $7, 16($18) + subq $1, $0, $23 + ldq $8, 24($17) + addq $2, $4, $0 + cmpult $3, $5, $24 + subq $3, $5, $3 + ldq $22, 24($18) + cmpult $3, $0, $5 + subq $3, $0, $25 + addq $5, $24, $0 + cmpult $6, $7, $27 + subq $6, $7, $6 + stq $23, 0($16) + cmpult $6, $0, $7 + subq $6, $0, $28 + addq $7, $27, $0 + cmpult $8, $22, $21 + subq $8, $22, $8 + stq $25, 8($16) + cmpult $8, $0, $22 + subq $8, $0, $20 + addq $22, $21, $0 + stq $28, 16($16) + subq $19, 4, $19 + stq $20, 24($16) + addq $17, 32, $17 + addq $18, 32, $18 + 
addq $16, 32, $16 + blt $19, $113 + ldq $1, 0($17) + ldq $2, 0($18) + br $114 +$115: + ldq $1, 0($17) + ldq $2, 0($18) + cmpult $1, $2, $27 + subq $1, $2, $1 + cmpult $1, $0, $2 + subq $1, $0, $1 + stq $1, 0($16) + addq $2, $27, $0 + addq $17, 8, $17 + addq $18, 8, $18 + addq $16, 8, $16 + subq $19, 1, $19 + bgt $19, $115 + ret $31,($26),1 +$113: + addq $19, 4, $19 + bgt $19, $115 +$116: + ret $31,($26),1 + .end bn_sub_words + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the lables. + # +.text + .align 3 + .globl bn_div64 + .ent bn_div64 +bn_div64: + ldgp $29,0($27) +bn_div64..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$9119 + lda $0,-1 + br $31,$9136 + .align 4 +$9119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$9120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$9120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$9120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$9122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$9122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$9123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$9126 + zapnot $7,15,$27 + br $31,$9127 + .align 4 +$9126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$9127: + srl $10,32,$4 + .align 5 +$9128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$9129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$9129 + subq $27,1,$27 + br $31,$9128 + .align 4 +$9129: + mulq $27,$6,$1 + mulq 
$27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$9134 + addq $9,$11,$9 + subq $27,1,$27 +$9134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$9124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$9123 + .align 4 +$9124: + bis $13,$27,$0 +$9136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div64 + .text + .align 3 + .globl bn_mul_comba8 + .ent bn_mul_comba8 +bn_mul_comba8: +bn_mul_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $30, 16, $30 + ldq $0, 0($17) + ldq $1, 0($18) + stq $9, 0($30) + stq $10, 8($30) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + ldq $8, 8($17) + ldq $22, 8($18) + ldq $23, 8($17) + ldq $24, 8($18) + ldq $25, 8($17) + ldq $27, 8($18) + ldq $28, 8($17) + ldq $21, 8($18) + bis $31, $31, $9 + mulq $0, $1, $20 + umulh $0, $1, $19 + stq $20, 0($16) + bis $31, $31, $10 + mulq $0, $3, $17 + umulh $0, $3, $18 + addq $19, $17, $19 + cmpult $19, $17, $20 + addq $20, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $17 + addq $10, $17, $10 + mulq $2, $1, $20 + umulh $2, $1, $18 + addq $19, $20, $19 + cmpult $19, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $20 + addq $10, $20, $10 + stq $19, 8($16) + bis $31, $31, $17 + mulq $0, $5, $18 + umulh $0, $5, $20 + addq $9, $18, $9 + cmpult $9, $18, $19 + addq $19, $20, $20 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $17, $18, $17 + mulq $2, $3, $19 + umulh $2, $3, $20 + addq $9, $19, $9 + cmpult $9, $19, $18 + addq $18, $20, $20 + addq $10, $20, $10 + cmpult $10, $20, $19 + addq $17, $19, $17 + mulq $4, $1, $18 + umulh $4, $1, $20 + addq $9, $18, $9 + cmpult $9, $18, $19 + addq $19, $20, $20 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $17, $18, $17 + stq $9, 16($16) 
+ bis $31, $31, $19 + mulq $0, $7, $20 + umulh $0, $7, $18 + addq $10, $20, $10 + cmpult $10, $20, $9 + addq $9, $18, $18 + addq $17, $18, $17 + cmpult $17, $18, $20 + addq $19, $20, $19 + mulq $2, $5, $9 + umulh $2, $5, $18 + addq $10, $9, $10 + cmpult $10, $9, $20 + addq $20, $18, $18 + addq $17, $18, $17 + cmpult $17, $18, $9 + addq $19, $9, $19 + mulq $4, $3, $20 + umulh $4, $3, $18 + addq $10, $20, $10 + cmpult $10, $20, $9 + addq $9, $18, $18 + addq $17, $18, $17 + cmpult $17, $18, $20 + addq $19, $20, $19 + mulq $6, $1, $9 + umulh $6, $1, $18 + addq $10, $9, $10 + cmpult $10, $9, $20 + addq $20, $18, $18 + addq $17, $18, $17 + cmpult $17, $18, $9 + addq $19, $9, $19 + stq $10, 24($16) + bis $31, $31, $20 + mulq $0, $22, $18 + umulh $0, $22, $9 + addq $17, $18, $17 + cmpult $17, $18, $10 + addq $10, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $18 + addq $20, $18, $20 + mulq $2, $7, $10 + umulh $2, $7, $9 + addq $17, $10, $17 + cmpult $17, $10, $18 + addq $18, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $10 + addq $20, $10, $20 + mulq $4, $5, $18 + umulh $4, $5, $9 + addq $17, $18, $17 + cmpult $17, $18, $10 + addq $10, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $18 + addq $20, $18, $20 + mulq $6, $3, $10 + umulh $6, $3, $9 + addq $17, $10, $17 + cmpult $17, $10, $18 + addq $18, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $10 + addq $20, $10, $20 + mulq $8, $1, $18 + umulh $8, $1, $9 + addq $17, $18, $17 + cmpult $17, $18, $10 + addq $10, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $18 + addq $20, $18, $20 + stq $17, 32($16) + bis $31, $31, $10 + mulq $0, $24, $9 + umulh $0, $24, $18 + addq $19, $9, $19 + cmpult $19, $9, $17 + addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $9 + addq $10, $9, $10 + mulq $2, $22, $17 + umulh $2, $22, $18 + addq $19, $17, $19 + cmpult $19, $17, $9 + addq $9, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $10, $17, $10 + mulq $4, $7, $9 + umulh $4, $7, $18 + addq $19, $9, $19 + cmpult $19, $9, $17 + 
addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $9 + addq $10, $9, $10 + mulq $6, $5, $17 + umulh $6, $5, $18 + addq $19, $17, $19 + cmpult $19, $17, $9 + addq $9, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $10, $17, $10 + mulq $8, $3, $9 + umulh $8, $3, $18 + addq $19, $9, $19 + cmpult $19, $9, $17 + addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $9 + addq $10, $9, $10 + mulq $23, $1, $17 + umulh $23, $1, $18 + addq $19, $17, $19 + cmpult $19, $17, $9 + addq $9, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $10, $17, $10 + stq $19, 40($16) + bis $31, $31, $9 + mulq $0, $27, $18 + umulh $0, $27, $17 + addq $20, $18, $20 + cmpult $20, $18, $19 + addq $19, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $18 + addq $9, $18, $9 + mulq $2, $24, $19 + umulh $2, $24, $17 + addq $20, $19, $20 + cmpult $20, $19, $18 + addq $18, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $19 + addq $9, $19, $9 + mulq $4, $22, $18 + umulh $4, $22, $17 + addq $20, $18, $20 + cmpult $20, $18, $19 + addq $19, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $18 + addq $9, $18, $9 + mulq $6, $7, $19 + umulh $6, $7, $17 + addq $20, $19, $20 + cmpult $20, $19, $18 + addq $18, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $19 + addq $9, $19, $9 + mulq $8, $5, $18 + umulh $8, $5, $17 + addq $20, $18, $20 + cmpult $20, $18, $19 + addq $19, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $18 + addq $9, $18, $9 + mulq $23, $3, $19 + umulh $23, $3, $17 + addq $20, $19, $20 + cmpult $20, $19, $18 + addq $18, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $19 + addq $9, $19, $9 + mulq $25, $1, $18 + umulh $25, $1, $17 + addq $20, $18, $20 + cmpult $20, $18, $19 + addq $19, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $18 + addq $9, $18, $9 + stq $20, 48($16) + bis $31, $31, $19 + mulq $0, $21, $17 + umulh $0, $21, $18 + addq $10, $17, $10 + cmpult $10, $17, $20 + addq $20, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $17 + addq $19, 
$17, $19 + mulq $2, $27, $20 + umulh $2, $27, $18 + addq $10, $20, $10 + cmpult $10, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $19, $0, $19 + mulq $4, $24, $20 + umulh $4, $24, $17 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $19, $0, $19 + mulq $6, $22, $20 + umulh $6, $22, $18 + addq $10, $20, $10 + cmpult $10, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $19, $0, $19 + mulq $8, $7, $20 + umulh $8, $7, $17 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $19, $0, $19 + mulq $23, $5, $20 + umulh $23, $5, $18 + addq $10, $20, $10 + cmpult $10, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $19, $0, $19 + mulq $25, $3, $20 + umulh $25, $3, $17 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $19, $0, $19 + mulq $28, $1, $20 + umulh $28, $1, $18 + addq $10, $20, $10 + cmpult $10, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $19, $0, $19 + stq $10, 56($16) + bis $31, $31, $20 + mulq $2, $21, $17 + umulh $2, $21, $18 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $0, $18, $18 + addq $19, $18, $19 + cmpult $19, $18, $1 + addq $20, $1, $20 + mulq $4, $27, $10 + umulh $4, $27, $17 + addq $9, $10, $9 + cmpult $9, $10, $0 + addq $0, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $20, $18, $20 + mulq $6, $24, $1 + umulh $6, $24, $2 + addq $9, $1, $9 + cmpult $9, $1, $10 + addq $10, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $0 + addq $20, $0, $20 + mulq $8, $22, $17 + umulh $8, $22, $18 + addq $9, $17, $9 + cmpult $9, $17, $1 + addq $1, $18, $18 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $20, $10, $20 + mulq $23, $7, $2 + umulh $23, $7, $0 + addq $9, $2, $9 + cmpult $9, $2, $17 + addq $17, $0, $0 + addq $19, $0, $19 + cmpult $19, $0, $1 
+ addq $20, $1, $20 + mulq $25, $5, $18 + umulh $25, $5, $10 + addq $9, $18, $9 + cmpult $9, $18, $2 + addq $2, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $20, $17, $20 + mulq $28, $3, $0 + umulh $28, $3, $1 + addq $9, $0, $9 + cmpult $9, $0, $18 + addq $18, $1, $1 + addq $19, $1, $19 + cmpult $19, $1, $2 + addq $20, $2, $20 + stq $9, 64($16) + bis $31, $31, $10 + mulq $4, $21, $17 + umulh $4, $21, $0 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $18, $0, $0 + addq $20, $0, $20 + cmpult $20, $0, $1 + addq $10, $1, $10 + mulq $6, $27, $2 + umulh $6, $27, $3 + addq $19, $2, $19 + cmpult $19, $2, $9 + addq $9, $3, $3 + addq $20, $3, $20 + cmpult $20, $3, $17 + addq $10, $17, $10 + mulq $8, $24, $18 + umulh $8, $24, $0 + addq $19, $18, $19 + cmpult $19, $18, $1 + addq $1, $0, $0 + addq $20, $0, $20 + cmpult $20, $0, $4 + addq $10, $4, $10 + mulq $23, $22, $2 + umulh $23, $22, $9 + addq $19, $2, $19 + cmpult $19, $2, $3 + addq $3, $9, $9 + addq $20, $9, $20 + cmpult $20, $9, $17 + addq $10, $17, $10 + mulq $25, $7, $18 + umulh $25, $7, $1 + addq $19, $18, $19 + cmpult $19, $18, $0 + addq $0, $1, $1 + addq $20, $1, $20 + cmpult $20, $1, $4 + addq $10, $4, $10 + mulq $28, $5, $2 + umulh $28, $5, $3 + addq $19, $2, $19 + cmpult $19, $2, $9 + addq $9, $3, $3 + addq $20, $3, $20 + cmpult $20, $3, $17 + addq $10, $17, $10 + stq $19, 72($16) + bis $31, $31, $18 + mulq $6, $21, $0 + umulh $6, $21, $1 + addq $20, $0, $20 + cmpult $20, $0, $4 + addq $4, $1, $1 + addq $10, $1, $10 + cmpult $10, $1, $2 + addq $18, $2, $18 + mulq $8, $27, $9 + umulh $8, $27, $3 + addq $20, $9, $20 + cmpult $20, $9, $17 + addq $17, $3, $3 + addq $10, $3, $10 + cmpult $10, $3, $5 + addq $18, $5, $18 + mulq $23, $24, $19 + umulh $23, $24, $0 + addq $20, $19, $20 + cmpult $20, $19, $4 + addq $4, $0, $0 + addq $10, $0, $10 + cmpult $10, $0, $1 + addq $18, $1, $18 + mulq $25, $22, $2 + umulh $25, $22, $6 + addq $20, $2, $20 + cmpult $20, $2, $9 + addq $9, $6, $6 + addq $10, $6, $10 + 
cmpult $10, $6, $17 + addq $18, $17, $18 + mulq $28, $7, $3 + umulh $28, $7, $5 + addq $20, $3, $20 + cmpult $20, $3, $19 + addq $19, $5, $5 + addq $10, $5, $10 + cmpult $10, $5, $4 + addq $18, $4, $18 + stq $20, 80($16) + bis $31, $31, $0 + mulq $8, $21, $1 + umulh $8, $21, $2 + addq $10, $1, $10 + cmpult $10, $1, $9 + addq $9, $2, $2 + addq $18, $2, $18 + cmpult $18, $2, $6 + addq $0, $6, $0 + mulq $23, $27, $17 + umulh $23, $27, $3 + addq $10, $17, $10 + cmpult $10, $17, $19 + addq $19, $3, $3 + addq $18, $3, $18 + cmpult $18, $3, $5 + addq $0, $5, $0 + mulq $25, $24, $4 + umulh $25, $24, $7 + addq $10, $4, $10 + cmpult $10, $4, $20 + addq $20, $7, $7 + addq $18, $7, $18 + cmpult $18, $7, $1 + addq $0, $1, $0 + mulq $28, $22, $9 + umulh $28, $22, $2 + addq $10, $9, $10 + cmpult $10, $9, $6 + addq $6, $2, $2 + addq $18, $2, $18 + cmpult $18, $2, $8 + addq $0, $8, $0 + stq $10, 88($16) + bis $31, $31, $17 + mulq $23, $21, $19 + umulh $23, $21, $3 + addq $18, $19, $18 + cmpult $18, $19, $5 + addq $5, $3, $3 + addq $0, $3, $0 + cmpult $0, $3, $4 + addq $17, $4, $17 + mulq $25, $27, $20 + umulh $25, $27, $7 + addq $18, $20, $18 + cmpult $18, $20, $1 + addq $1, $7, $7 + addq $0, $7, $0 + cmpult $0, $7, $9 + addq $17, $9, $17 + mulq $28, $24, $6 + umulh $28, $24, $2 + addq $18, $6, $18 + cmpult $18, $6, $8 + addq $8, $2, $2 + addq $0, $2, $0 + cmpult $0, $2, $22 + addq $17, $22, $17 + stq $18, 96($16) + bis $31, $31, $10 + mulq $25, $21, $19 + umulh $25, $21, $5 + addq $0, $19, $0 + cmpult $0, $19, $3 + addq $3, $5, $5 + addq $17, $5, $17 + cmpult $17, $5, $4 + addq $10, $4, $10 + mulq $28, $27, $23 + umulh $28, $27, $20 + addq $0, $23, $0 + cmpult $0, $23, $1 + addq $1, $20, $20 + addq $17, $20, $17 + cmpult $17, $20, $7 + addq $10, $7, $10 + stq $0, 104($16) + bis $31, $31, $9 + mulq $28, $21, $6 + umulh $28, $21, $8 + addq $17, $6, $17 + cmpult $17, $6, $2 + addq $2, $8, $8 + addq $10, $8, $10 + cmpult $10, $8, $22 + addq $9, $22, $9 + stq $17, 112($16) + stq $10, 
120($16) + ldq $9, 0($30) + ldq $10, 8($30) + addq $30, 16, $30 + ret $31,($26),1 + .end bn_mul_comba8 + .text + .align 3 + .globl bn_mul_comba4 + .ent bn_mul_comba4 +bn_mul_comba4: +bn_mul_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 0($18) + ldq $2, 8($17) + ldq $3, 8($18) + mulq $0, $1, $4 + ldq $5, 16($17) + ldq $6, 16($18) + umulh $0, $1, $7 + ldq $8, 24($17) + ldq $22, 24($18) + mulq $0, $3, $23 + stq $4, 0($16) + bis $31, $31, $24 + mulq $2, $1, $28 + bis $31, $31, $25 + bis $31, $31, + addq $24, $7, $24 + umulh $0, $3, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $25, $20, $25 + umulh $2, $1, $19 + cmpult $24, $23, $17 + addq $24, $28, $24 + addq $27, $17, $27 + mulq $0, $6, $18 + cmpult $24, $28, $4 + addq $25, $4, $25 + stq $24, 8($16) + addq $25, $27, $24 + bis $31, $31, $25 + addq $24, $21, $24 + bis $31, $31, $27 + mulq $2, $3, $7 + cmpult $24, $21, $20 + addq $24, $19, $24 + addq $25, $20, $25 + mulq $5, $1, $23 + cmpult $24, $19, $17 + addq $24, $7, $24 + addq $27, $17, $27 + umulh $0, $6, $28 + cmpult $24, $18, $4 + addq $24, $7, $24 + addq $25, $4, $25 + umulh $2, $3, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $27, $20, $27 + umulh $5, $1, $19 + cmpult $24, $23, $17 + addq $25, $17, $25 + stq $24, 16($16) + addq $25, $27, $24 + bis $31, $31, $25 + addq $24, $28, $24 + bis $31, $31, $27 + mulq $0, $22, $18 + cmpult $24, $28, $4 + addq $24, $21, $24 + addq $25, $4, $25 + mulq $2, $6, $7 + cmpult $24, $21, $20 + addq $24, $19, $24 + addq $25, $20, $25 + mulq $5, $3, $23 + cmpult $24, $19, $17 + addq $24, $18, $24 + addq $25, $17, $25 + mulq $8, $1, $28 + cmpult $24, $18, $4 + addq $24, $7, $24 + addq $25, $4, $25 + umulh $0, $22, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $25, $20, $25 + umulh $2, $6, $19 + cmpult $24, $23, $17 + addq $24, $28, $24 + addq $25, $17, $25 + umulh $5, $3, $18 + cmpult $24, $28, $4 + addq $25, $4, $25 + stq $24, 24($16) + addq $25, $27, $24 + bis $31, $31, $25 + 
addq $24, $21, $24 + bis $31, $31, $27 + umulh $8, $1, $7 + cmpult $24, $21, $20 + addq $24, $19, $24 + addq $25, $20, $25 + mulq $2, $22, $23 + cmpult $24, $19, $17 + addq $24, $18, $24 + addq $25, $17, $25 + mulq $5, $6, $28 + cmpult $24, $18, $4 + addq $24, $7, $24 + addq $25, $4, $25 + mulq $8, $3, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $25, $20, $25 + umulh $2, $22, $19 + cmpult $24, $23, $17 + addq $24, $28, $24 + addq $25, $17, $25 + umulh $5, $6, $18 + cmpult $24, $28, $4 + addq $24, $21, $24 + addq $25, $4, $25 + umulh $8, $3, $7 + cmpult $24, $21, $20 + addq $25, $20, $25 + stq $24, 32($16) + addq $25, $27, $24 + bis $31, $31, $25 + addq $24, $19, $24 + bis $31, $31, $27 + mulq $5, $22, $23 + cmpult $24, $19, $17 + addq $24, $18, $24 + addq $25, $17, $25 + mulq $8, $6, $28 + cmpult $24, $18, $4 + addq $24, $7, $24 + addq $25, $4, $25 + umulh $5, $22, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $25, $20, $25 + umulh $8, $6, $19 + cmpult $24, $23, $17 + addq $24, $28, $24 + addq $25, $17, $25 + mulq $8, $22, $18 + cmpult $24, $28, $4 + addq $25, $4, $25 + stq $24, 40($16) + addq $25, $27, $24 + bis $31, $31, $25 + addq $24, $21, $24 + bis $31, $31, $27 + umulh $8, $22, $7 + cmpult $24, $21, $20 + addq $24, $19, $24 + addq $25, $20, $25 + cmpult $24, $19, $23 + addq $24, $18, $24 + addq $25, $23, $25 + cmpult $24, $18, $17 + addq $25, $17, $25 + stq $24, 48($16) + addq $25, $27, $24 + addq $24, $7, $24 + stq $24, 56($16) + ret $31,($26),1 + .end bn_mul_comba4 + .text + .align 3 + .globl bn_sqr_comba4 + .ent bn_sqr_comba4 +bn_sqr_comba4: +bn_sqr_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + bis $31, $31, $6 + mulq $0, $0, $4 + umulh $0, $0, $5 + stq $4, 0($16) + bis $31, $31, $4 + mulq $0, $1, $7 + umulh $0, $1, $8 + cmplt $7, $31, $22 + cmplt $8, $31, $23 + addq $7, $7, $7 + addq $8, $8, $8 + addq $8, $22, $8 + addq $4, $23, $4 + addq $5, $7, $5 + addq $6, $8, 
$6 + cmpult $5, $7, $24 + cmpult $6, $8, $25 + addq $6, $24, $6 + addq $4, $25, $4 + stq $5, 8($16) + bis $31, $31, $5 + mulq $1, $1, $27 + umulh $1, $1, $28 + addq $6, $27, $6 + addq $4, $28, $4 + cmpult $6, $27, $21 + cmpult $4, $28, $20 + addq $4, $21, $4 + addq $5, $20, $5 + mulq $2, $0, $19 + umulh $2, $0, $18 + cmplt $19, $31, $17 + cmplt $18, $31, $22 + addq $19, $19, $19 + addq $18, $18, $18 + addq $18, $17, $18 + addq $5, $22, $5 + addq $6, $19, $6 + addq $4, $18, $4 + cmpult $6, $19, $23 + cmpult $4, $18, $7 + addq $4, $23, $4 + addq $5, $7, $5 + stq $6, 16($16) + bis $31, $31, $6 + mulq $3, $0, $8 + umulh $3, $0, $24 + cmplt $8, $31, $25 + cmplt $24, $31, $27 + addq $8, $8, $8 + addq $24, $24, $24 + addq $24, $25, $24 + addq $6, $27, $6 + addq $4, $8, $4 + addq $5, $24, $5 + cmpult $4, $8, $28 + cmpult $5, $24, $21 + addq $5, $28, $5 + addq $6, $21, $6 + mulq $2, $1, $20 + umulh $2, $1, $17 + cmplt $20, $31, $22 + cmplt $17, $31, $19 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $22, $17 + addq $6, $19, $6 + addq $4, $20, $4 + addq $5, $17, $5 + cmpult $4, $20, $18 + cmpult $5, $17, $23 + addq $5, $18, $5 + addq $6, $23, $6 + stq $4, 24($16) + bis $31, $31, $4 + mulq $2, $2, $7 + umulh $2, $2, $25 + addq $5, $7, $5 + addq $6, $25, $6 + cmpult $5, $7, $27 + cmpult $6, $25, $8 + addq $6, $27, $6 + addq $4, $8, $4 + mulq $3, $1, $24 + umulh $3, $1, $28 + cmplt $24, $31, $21 + cmplt $28, $31, $22 + addq $24, $24, $24 + addq $28, $28, $28 + addq $28, $21, $28 + addq $4, $22, $4 + addq $5, $24, $5 + addq $6, $28, $6 + cmpult $5, $24, $19 + cmpult $6, $28, $20 + addq $6, $19, $6 + addq $4, $20, $4 + stq $5, 32($16) + bis $31, $31, $5 + mulq $3, $2, $17 + umulh $3, $2, $18 + cmplt $17, $31, $23 + cmplt $18, $31, $7 + addq $17, $17, $17 + addq $18, $18, $18 + addq $18, $23, $18 + addq $5, $7, $5 + addq $6, $17, $6 + addq $4, $18, $4 + cmpult $6, $17, $25 + cmpult $4, $18, $27 + addq $4, $25, $4 + addq $5, $27, $5 + stq $6, 40($16) + bis $31, $31, $6 + 
mulq $3, $3, $8 + umulh $3, $3, $21 + addq $4, $8, $4 + addq $5, $21, $5 + cmpult $4, $8, $22 + cmpult $5, $21, $24 + addq $5, $22, $5 + addq $6, $24, $6 + stq $4, 48($16) + stq $5, 56($16) + ret $31,($26),1 + .end bn_sqr_comba4 + .text + .align 3 + .globl bn_sqr_comba8 + .ent bn_sqr_comba8 +bn_sqr_comba8: +bn_sqr_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + ldq $4, 32($17) + ldq $5, 40($17) + ldq $6, 48($17) + ldq $7, 56($17) + bis $31, $31, $23 + mulq $0, $0, $8 + umulh $0, $0, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $1, $0, $24 + umulh $1, $0, $25 + cmplt $24, $31, $27 + cmplt $25, $31, $28 + addq $24, $24, $24 + addq $25, $25, $25 + addq $25, $27, $25 + addq $8, $28, $8 + addq $22, $24, $22 + addq $23, $25, $23 + cmpult $22, $24, $21 + cmpult $23, $25, $20 + addq $23, $21, $23 + addq $8, $20, $8 + stq $22, 8($16) + bis $31, $31, $22 + mulq $1, $1, $19 + umulh $1, $1, $18 + addq $23, $19, $23 + addq $8, $18, $8 + cmpult $23, $19, $17 + cmpult $8, $18, $27 + addq $8, $17, $8 + addq $22, $27, $22 + mulq $2, $0, $28 + umulh $2, $0, $24 + cmplt $28, $31, $25 + cmplt $24, $31, $21 + addq $28, $28, $28 + addq $24, $24, $24 + addq $24, $25, $24 + addq $22, $21, $22 + addq $23, $28, $23 + addq $8, $24, $8 + cmpult $23, $28, $20 + cmpult $8, $24, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $2, $1, $18 + umulh $2, $1, $17 + cmplt $18, $31, $27 + cmplt $17, $31, $25 + addq $18, $18, $18 + addq $17, $17, $17 + addq $17, $27, $17 + addq $23, $25, $23 + addq $8, $18, $8 + addq $22, $17, $22 + cmpult $8, $18, $21 + cmpult $22, $17, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $3, $0, $24 + umulh $3, $0, $20 + cmplt $24, $31, $19 + cmplt $20, $31, $27 + addq $24, $24, $24 + addq $20, $20, $20 + addq $20, $19, $20 + addq $23, $27, $23 + addq $8, $24, $8 + addq $22, $20, $22 + cmpult $8, $24, $25 + cmpult $22, $20, $18 + addq $22, $25, $22 + 
addq $23, $18, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $2, $17 + umulh $2, $2, $21 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $28 + cmpult $23, $21, $19 + addq $23, $28, $23 + addq $8, $19, $8 + mulq $3, $1, $27 + umulh $3, $1, $24 + cmplt $27, $31, $20 + cmplt $24, $31, $25 + addq $27, $27, $27 + addq $24, $24, $24 + addq $24, $20, $24 + addq $8, $25, $8 + addq $22, $27, $22 + addq $23, $24, $23 + cmpult $22, $27, $18 + cmpult $23, $24, $17 + addq $23, $18, $23 + addq $8, $17, $8 + mulq $4, $0, $21 + umulh $4, $0, $28 + cmplt $21, $31, $19 + cmplt $28, $31, $20 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $19, $28 + addq $8, $20, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $25 + cmpult $23, $28, $27 + addq $23, $25, $23 + addq $8, $27, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $3, $2, $24 + umulh $3, $2, $18 + cmplt $24, $31, $17 + cmplt $18, $31, $19 + addq $24, $24, $24 + addq $18, $18, $18 + addq $18, $17, $18 + addq $22, $19, $22 + addq $23, $24, $23 + addq $8, $18, $8 + cmpult $23, $24, $20 + cmpult $8, $18, $21 + addq $8, $20, $8 + addq $22, $21, $22 + mulq $4, $1, $28 + umulh $4, $1, $25 + cmplt $28, $31, $27 + cmplt $25, $31, $17 + addq $28, $28, $28 + addq $25, $25, $25 + addq $25, $27, $25 + addq $22, $17, $22 + addq $23, $28, $23 + addq $8, $25, $8 + cmpult $23, $28, $19 + cmpult $8, $25, $24 + addq $8, $19, $8 + addq $22, $24, $22 + mulq $5, $0, $18 + umulh $5, $0, $20 + cmplt $18, $31, $21 + cmplt $20, $31, $27 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $21, $20 + addq $22, $27, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $28 + addq $8, $17, $8 + addq $22, $28, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $3, $3, $25 + umulh $3, $3, $19 + addq $8, $25, $8 + addq $22, $19, $22 + cmpult $8, $25, $24 + cmpult $22, $19, $21 + addq $22, $24, $22 + addq $23, $21, $23 + mulq $4, $2, $27 + umulh $4, $2, $18 + cmplt $27, $31, $20 + cmplt 
$18, $31, $17 + addq $27, $27, $27 + addq $18, $18, $18 + addq $18, $20, $18 + addq $23, $17, $23 + addq $8, $27, $8 + addq $22, $18, $22 + cmpult $8, $27, $28 + cmpult $22, $18, $25 + addq $22, $28, $22 + addq $23, $25, $23 + mulq $5, $1, $19 + umulh $5, $1, $24 + cmplt $19, $31, $21 + cmplt $24, $31, $20 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $21, $24 + addq $23, $20, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $17 + cmpult $22, $24, $27 + addq $22, $17, $22 + addq $23, $27, $23 + mulq $6, $0, $18 + umulh $6, $0, $28 + cmplt $18, $31, $25 + cmplt $28, $31, $21 + addq $18, $18, $18 + addq $28, $28, $28 + addq $28, $25, $28 + addq $23, $21, $23 + addq $8, $18, $8 + addq $22, $28, $22 + cmpult $8, $18, $20 + cmpult $22, $28, $19 + addq $22, $20, $22 + addq $23, $19, $23 + stq $8, 48($16) + bis $31, $31, $8 + mulq $4, $3, $24 + umulh $4, $3, $17 + cmplt $24, $31, $27 + cmplt $17, $31, $25 + addq $24, $24, $24 + addq $17, $17, $17 + addq $17, $27, $17 + addq $8, $25, $8 + addq $22, $24, $22 + addq $23, $17, $23 + cmpult $22, $24, $21 + cmpult $23, $17, $18 + addq $23, $21, $23 + addq $8, $18, $8 + mulq $5, $2, $28 + umulh $5, $2, $20 + cmplt $28, $31, $19 + cmplt $20, $31, $27 + addq $28, $28, $28 + addq $20, $20, $20 + addq $20, $19, $20 + addq $8, $27, $8 + addq $22, $28, $22 + addq $23, $20, $23 + cmpult $22, $28, $25 + cmpult $23, $20, $24 + addq $23, $25, $23 + addq $8, $24, $8 + mulq $6, $1, $17 + umulh $6, $1, $21 + cmplt $17, $31, $18 + cmplt $21, $31, $19 + addq $17, $17, $17 + addq $21, $21, $21 + addq $21, $18, $21 + addq $8, $19, $8 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $27 + cmpult $23, $21, $28 + addq $23, $27, $23 + addq $8, $28, $8 + mulq $7, $0, $20 + umulh $7, $0, $25 + cmplt $20, $31, $24 + cmplt $25, $31, $18 + addq $20, $20, $20 + addq $25, $25, $25 + addq $25, $24, $25 + addq $8, $18, $8 + addq $22, $20, $22 + addq $23, $25, $23 + cmpult $22, $20, $19 + cmpult $23, $25, $17 + addq $23, $19, 
$23 + addq $8, $17, $8 + stq $22, 56($16) + bis $31, $31, $22 + mulq $4, $4, $21 + umulh $4, $4, $27 + addq $23, $21, $23 + addq $8, $27, $8 + cmpult $23, $21, $28 + cmpult $8, $27, $24 + addq $8, $28, $8 + addq $22, $24, $22 + mulq $5, $3, $18 + umulh $5, $3, $20 + cmplt $18, $31, $25 + cmplt $20, $31, $19 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $25, $20 + addq $22, $19, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $21 + addq $8, $17, $8 + addq $22, $21, $22 + mulq $6, $2, $27 + umulh $6, $2, $28 + cmplt $27, $31, $24 + cmplt $28, $31, $25 + addq $27, $27, $27 + addq $28, $28, $28 + addq $28, $24, $28 + addq $22, $25, $22 + addq $23, $27, $23 + addq $8, $28, $8 + cmpult $23, $27, $19 + cmpult $8, $28, $18 + addq $8, $19, $8 + addq $22, $18, $22 + mulq $7, $1, $20 + umulh $7, $1, $17 + cmplt $20, $31, $21 + cmplt $17, $31, $24 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $21, $17 + addq $22, $24, $22 + addq $23, $20, $23 + addq $8, $17, $8 + cmpult $23, $20, $25 + cmpult $8, $17, $27 + addq $8, $25, $8 + addq $22, $27, $22 + stq $23, 64($16) + bis $31, $31, $23 + mulq $5, $4, $28 + umulh $5, $4, $19 + cmplt $28, $31, $18 + cmplt $19, $31, $21 + addq $28, $28, $28 + addq $19, $19, $19 + addq $19, $18, $19 + addq $23, $21, $23 + addq $8, $28, $8 + addq $22, $19, $22 + cmpult $8, $28, $24 + cmpult $22, $19, $20 + addq $22, $24, $22 + addq $23, $20, $23 + mulq $6, $3, $17 + umulh $6, $3, $25 + cmplt $17, $31, $27 + cmplt $25, $31, $18 + addq $17, $17, $17 + addq $25, $25, $25 + addq $25, $27, $25 + addq $23, $18, $23 + addq $8, $17, $8 + addq $22, $25, $22 + cmpult $8, $17, $21 + cmpult $22, $25, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $7, $2, $19 + umulh $7, $2, $24 + cmplt $19, $31, $20 + cmplt $24, $31, $27 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $20, $24 + addq $23, $27, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $18 + cmpult $22, $24, $17 + addq $22, $18, 
$22 + addq $23, $17, $23 + stq $8, 72($16) + bis $31, $31, $8 + mulq $5, $5, $25 + umulh $5, $5, $21 + addq $22, $25, $22 + addq $23, $21, $23 + cmpult $22, $25, $28 + cmpult $23, $21, $20 + addq $23, $28, $23 + addq $8, $20, $8 + mulq $6, $4, $27 + umulh $6, $4, $19 + cmplt $27, $31, $24 + cmplt $19, $31, $18 + addq $27, $27, $27 + addq $19, $19, $19 + addq $19, $24, $19 + addq $8, $18, $8 + addq $22, $27, $22 + addq $23, $19, $23 + cmpult $22, $27, $17 + cmpult $23, $19, $25 + addq $23, $17, $23 + addq $8, $25, $8 + mulq $7, $3, $21 + umulh $7, $3, $28 + cmplt $21, $31, $20 + cmplt $28, $31, $24 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $20, $28 + addq $8, $24, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $18 + cmpult $23, $28, $27 + addq $23, $18, $23 + addq $8, $27, $8 + stq $22, 80($16) + bis $31, $31, $22 + mulq $6, $5, $19 + umulh $6, $5, $17 + cmplt $19, $31, $25 + cmplt $17, $31, $20 + addq $19, $19, $19 + addq $17, $17, $17 + addq $17, $25, $17 + addq $22, $20, $22 + addq $23, $19, $23 + addq $8, $17, $8 + cmpult $23, $19, $24 + cmpult $8, $17, $21 + addq $8, $24, $8 + addq $22, $21, $22 + mulq $7, $4, $28 + umulh $7, $4, $18 + cmplt $28, $31, $27 + cmplt $18, $31, $25 + addq $28, $28, $28 + addq $18, $18, $18 + addq $18, $27, $18 + addq $22, $25, $22 + addq $23, $28, $23 + addq $8, $18, $8 + cmpult $23, $28, $20 + cmpult $8, $18, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 88($16) + bis $31, $31, $23 + mulq $6, $6, $17 + umulh $6, $6, $24 + addq $8, $17, $8 + addq $22, $24, $22 + cmpult $8, $17, $21 + cmpult $22, $24, $27 + addq $22, $21, $22 + addq $23, $27, $23 + mulq $7, $5, $25 + umulh $7, $5, $28 + cmplt $25, $31, $18 + cmplt $28, $31, $20 + addq $25, $25, $25 + addq $28, $28, $28 + addq $28, $18, $28 + addq $23, $20, $23 + addq $8, $25, $8 + addq $22, $28, $22 + cmpult $8, $25, $19 + cmpult $22, $28, $17 + addq $22, $19, $22 + addq $23, $17, $23 + stq $8, 96($16) + bis $31, $31, $8 + mulq $7, $6, $24 + 
umulh $7, $6, $21 + cmplt $24, $31, $27 + cmplt $21, $31, $18 + addq $24, $24, $24 + addq $21, $21, $21 + addq $21, $27, $21 + addq $8, $18, $8 + addq $22, $24, $22 + addq $23, $21, $23 + cmpult $22, $24, $20 + cmpult $23, $21, $25 + addq $23, $20, $23 + addq $8, $25, $8 + stq $22, 104($16) + bis $31, $31, $22 + mulq $7, $7, $28 + umulh $7, $7, $19 + addq $23, $28, $23 + addq $8, $19, $8 + cmpult $23, $28, $17 + cmpult $8, $19, $27 + addq $8, $17, $8 + addq $22, $27, $22 + stq $23, 112($16) + stq $8, 120($16) + ret $31,($26),1 + .end bn_sqr_comba8 diff --git a/crypto/bn/asm/ff b/crypto/bn/asm/ff new file mode 100644 index 0000000000..4af216889d --- /dev/null +++ b/crypto/bn/asm/ff @@ -0,0 +1,724 @@ + .text + .align 3 + .globl bn_mul_comba4 + .ent bn_mul_comba4 +bn_mul_comba4: +bn_mul_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 0($18) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + bis $31, $31, $23 + mulq $0, $1, $8 + umulh $0, $1, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $0, $3, $24 + umulh $0, $3, $25 + addq $22, $24, $22 + cmpult $22, $24, $27 + addq $27, $25, $25 + addq $23, $25, $23 + cmpult $23, $25, $28 + addq $8, $28, $8 + mulq $2, $1, $21 + umulh $2, $1, $20 + addq $22, $21, $22 + cmpult $22, $21, $19 + addq $19, $20, $20 + addq $23, $20, $23 + cmpult $23, $20, $17 + addq $8, $17, $8 + stq $22, 8($16) + bis $31, $31, $22 + mulq $2, $3, $18 + umulh $2, $3, $24 + addq $23, $18, $23 + cmpult $23, $18, $27 + addq $27, $24, $24 + addq $8, $24, $8 + cmpult $8, $24, $25 + addq $22, $25, $22 + mulq $0, $5, $28 + umulh $0, $5, $21 + addq $23, $28, $23 + cmpult $23, $28, $19 + addq $19, $21, $21 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $22, $20, $22 + mulq $4, $1, $17 + umulh $4, $1, $18 + addq $23, $17, $23 + cmpult $23, $17, $27 + addq $27, $18, $18 + addq $8, $18, $8 + cmpult $8, $18, $24 + addq $22, $24, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $0, 
$7, $25 + umulh $0, $7, $28 + addq $8, $25, $8 + cmpult $8, $25, $19 + addq $19, $28, $28 + addq $22, $28, $22 + cmpult $22, $28, $21 + addq $23, $21, $23 + mulq $2, $5, $20 + umulh $2, $5, $17 + addq $8, $20, $8 + cmpult $8, $20, $27 + addq $27, $17, $17 + addq $22, $17, $22 + cmpult $22, $17, $18 + addq $23, $18, $23 + mulq $4, $3, $24 + umulh $4, $3, $25 + addq $8, $24, $8 + cmpult $8, $24, $19 + addq $19, $25, $25 + addq $22, $25, $22 + cmpult $22, $25, $28 + addq $23, $28, $23 + mulq $6, $1, $21 + umulh $6, $1, $0 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $20, $0, $0 + addq $22, $0, $22 + cmpult $22, $0, $27 + addq $23, $27, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $7, $17 + umulh $2, $7, $18 + addq $22, $17, $22 + cmpult $22, $17, $24 + addq $24, $18, $18 + addq $23, $18, $23 + cmpult $23, $18, $19 + addq $8, $19, $8 + mulq $4, $5, $25 + umulh $4, $5, $28 + addq $22, $25, $22 + cmpult $22, $25, $21 + addq $21, $28, $28 + addq $23, $28, $23 + cmpult $23, $28, $20 + addq $8, $20, $8 + mulq $6, $3, $0 + umulh $6, $3, $27 + addq $22, $0, $22 + cmpult $22, $0, $1 + addq $1, $27, $27 + addq $23, $27, $23 + cmpult $23, $27, $17 + addq $8, $17, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $4, $7, $24 + umulh $4, $7, $18 + addq $23, $24, $23 + cmpult $23, $24, $19 + addq $19, $18, $18 + addq $8, $18, $8 + cmpult $8, $18, $2 + addq $22, $2, $22 + mulq $6, $5, $25 + umulh $6, $5, $21 + addq $23, $25, $23 + cmpult $23, $25, $28 + addq $28, $21, $21 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $22, $20, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $6, $7, $0 + umulh $6, $7, $1 + addq $8, $0, $8 + cmpult $8, $0, $27 + addq $27, $1, $1 + addq $22, $1, $22 + cmpult $22, $1, $17 + addq $23, $17, $23 + stq $8, 48($16) + stq $22, 56($16) + ret $31,($26),1 + .end bn_mul_comba4 + .text + .align 3 + .globl bn_mul_comba8 + .ent bn_mul_comba8 +bn_mul_comba8: +bn_mul_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + stq $9, 8($30) + stq $10, 16($30) + 
ldq $0, 0($17) + ldq $1, 0($18) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + # upper operand words a[4..7] ($8,$23,$25,$28) and b[4..7] ($22,$24,$27,$21): 8-byte words at offsets 32..56 + ldq $8, 32($17) + ldq $22, 32($18) + ldq $23, 40($17) + ldq $24, 40($18) + ldq $25, 48($17) + ldq $27, 48($18) + ldq $28, 56($17) + ldq $21, 56($18) + bis $31, $31, $9 + mulq $0, $1, $20 + umulh $0, $1, $19 + stq $20, 0($16) + bis $31, $31, $20 + mulq $0, $3, $10 + umulh $0, $3, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $2, $1, $18 + umulh $2, $1, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + stq $19, 8($16) + bis $31, $31, $19 + mulq $0, $5, $10 + umulh $0, $5, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $2, $3, $18 + umulh $2, $3, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $4, $1, $10 + umulh $4, $1, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + stq $9, 16($16) + bis $31, $31, $9 + mulq $0, $7, $18 + umulh $0, $7, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $2, $5, $10 + umulh $2, $5, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $4, $3, $18 + umulh $4, $3, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $6, $1, $10 + umulh $6, $1, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + stq $20, 24($16) + bis $31, $31, $20 + mulq $0, 
$22, $18 + umulh $0, $22, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $2, $7, $10 + umulh $2, $7, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $4, $5, $18 + umulh $4, $5, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $6, $3, $10 + umulh $6, $3, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $8, $1, $18 + umulh $8, $1, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + stq $19, 32($16) + bis $31, $31, $19 + mulq $0, $24, $10 + umulh $0, $24, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $2, $22, $18 + umulh $2, $22, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $4, $7, $10 + umulh $4, $7, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $6, $5, $18 + umulh $6, $5, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $8, $3, $10 + umulh $8, $3, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $23, $1, $18 + umulh $23, $1, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + stq $9, 40($16) + bis $31, $31, $9 + mulq $0, $27, $10 + umulh $0, $27, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, 
$17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $2, $24, $18 + umulh $2, $24, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $4, $22, $10 + umulh $4, $22, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $6, $7, $18 + umulh $6, $7, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $8, $5, $10 + umulh $8, $5, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $23, $3, $18 + umulh $23, $3, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $25, $1, $10 + umulh $25, $1, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + stq $20, 48($16) + bis $31, $31, $20 + mulq $0, $21, $18 + umulh $0, $21, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $2, $27, $10 + umulh $2, $27, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $4, $24, $10 + umulh $4, $24, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $6, $22, $10 + umulh $6, $22, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $8, $7, $10 + umulh $8, $7, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $23, $5, $10 + umulh $23, $5, $17 + addq 
$19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $25, $3, $10 + umulh $25, $3, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $28, $1, $10 + umulh $28, $1, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + stq $19, 56($16) + bis $31, $31, $19 + mulq $2, $21, $10 + umulh $2, $21, $18 + addq $9, $10, $9 + cmpult $9, $10, $17 + addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $0 + addq $19, $0, $19 + mulq $4, $27, $1 + umulh $4, $27, $10 + addq $9, $1, $9 + cmpult $9, $1, $17 + addq $17, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $19, $18, $19 + mulq $6, $24, $0 + umulh $6, $24, $2 + addq $9, $0, $9 + cmpult $9, $0, $1 + addq $1, $2, $2 + addq $20, $2, $20 + cmpult $20, $2, $17 + addq $19, $17, $19 + mulq $8, $22, $10 + umulh $8, $22, $18 + addq $9, $10, $9 + cmpult $9, $10, $0 + addq $0, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $1 + addq $19, $1, $19 + mulq $23, $7, $2 + umulh $23, $7, $17 + addq $9, $2, $9 + cmpult $9, $2, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $0 + addq $19, $0, $19 + mulq $25, $5, $18 + umulh $25, $5, $1 + addq $9, $18, $9 + cmpult $9, $18, $2 + addq $2, $1, $1 + addq $20, $1, $20 + cmpult $20, $1, $10 + addq $19, $10, $19 + mulq $28, $3, $17 + umulh $28, $3, $0 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $18, $0, $0 + addq $20, $0, $20 + cmpult $20, $0, $2 + addq $19, $2, $19 + stq $9, 64($16) + bis $31, $31, $9 + mulq $4, $21, $1 + umulh $4, $21, $10 + addq $20, $1, $20 + cmpult $20, $1, $17 + addq $17, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $9, $18, $9 + mulq $6, $27, $0 + umulh $6, $27, $2 + addq $20, $0, $20 + cmpult $20, $0, $3 + addq $3, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $1 + addq $9, $1, $9 + mulq $8, 
$24, $17 + umulh $8, $24, $10 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $18, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $4 + addq $9, $4, $9 + mulq $23, $22, $0 + umulh $23, $22, $3 + addq $20, $0, $20 + cmpult $20, $0, $2 + addq $2, $3, $3 + addq $19, $3, $19 + cmpult $19, $3, $1 + addq $9, $1, $9 + mulq $25, $7, $17 + umulh $25, $7, $18 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $10, $18, $18 + addq $19, $18, $19 + cmpult $19, $18, $4 + addq $9, $4, $9 + mulq $28, $5, $0 + umulh $28, $5, $2 + addq $20, $0, $20 + cmpult $20, $0, $3 + addq $3, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $1 + addq $9, $1, $9 + stq $20, 72($16) + bis $31, $31, $20 + mulq $6, $21, $17 + umulh $6, $21, $10 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $18, $10, $10 + addq $9, $10, $9 + cmpult $9, $10, $4 + addq $20, $4, $20 + mulq $8, $27, $0 + umulh $8, $27, $3 + addq $19, $0, $19 + cmpult $19, $0, $2 + addq $2, $3, $3 + addq $9, $3, $9 + cmpult $9, $3, $1 + addq $20, $1, $20 + mulq $23, $24, $5 + umulh $23, $24, $17 + addq $19, $5, $19 + cmpult $19, $5, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $25, $22, $4 + umulh $25, $22, $6 + addq $19, $4, $19 + cmpult $19, $4, $0 + addq $0, $6, $6 + addq $9, $6, $9 + cmpult $9, $6, $2 + addq $20, $2, $20 + mulq $28, $7, $3 + umulh $28, $7, $1 + addq $19, $3, $19 + cmpult $19, $3, $5 + addq $5, $1, $1 + addq $9, $1, $9 + cmpult $9, $1, $18 + addq $20, $18, $20 + stq $19, 80($16) + bis $31, $31, $19 + mulq $8, $21, $17 + umulh $8, $21, $10 + addq $9, $17, $9 + cmpult $9, $17, $4 + addq $4, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $0 + addq $19, $0, $19 + mulq $23, $27, $6 + umulh $23, $27, $2 + addq $9, $6, $9 + cmpult $9, $6, $3 + addq $3, $2, $2 + addq $20, $2, $20 + cmpult $20, $2, $5 + addq $19, $5, $19 + mulq $25, $24, $1 + umulh $25, $24, $18 + addq $9, $1, $9 + cmpult $9, $1, $7 + addq $7, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $19, 
$17, $19 + mulq $28, $22, $4 + umulh $28, $22, $10 + addq $9, $4, $9 + cmpult $9, $4, $0 + addq $0, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $8 + addq $19, $8, $19 + stq $9, 88($16) + bis $31, $31, $9 + mulq $23, $21, $6 + umulh $23, $21, $3 + addq $20, $6, $20 + cmpult $20, $6, $2 + addq $2, $3, $3 + addq $19, $3, $19 + cmpult $19, $3, $5 + addq $9, $5, $9 + mulq $25, $27, $1 + umulh $25, $27, $7 + addq $20, $1, $20 + cmpult $20, $1, $18 + addq $18, $7, $7 + addq $19, $7, $19 + cmpult $19, $7, $17 + addq $9, $17, $9 + mulq $28, $24, $4 + umulh $28, $24, $0 + addq $20, $4, $20 + cmpult $20, $4, $10 + addq $10, $0, $0 + addq $19, $0, $19 + cmpult $19, $0, $8 + addq $9, $8, $9 + stq $20, 96($16) + bis $31, $31, $20 + mulq $25, $21, $22 + umulh $25, $21, $6 + addq $19, $22, $19 + cmpult $19, $22, $2 + addq $2, $6, $6 + addq $9, $6, $9 + cmpult $9, $6, $3 + addq $20, $3, $20 + mulq $28, $27, $5 + umulh $28, $27, $23 + addq $19, $5, $19 + cmpult $19, $5, $1 + addq $1, $23, $23 + addq $9, $23, $9 + cmpult $9, $23, $18 + addq $20, $18, $20 + stq $19, 104($16) + bis $31, $31, $19 + mulq $28, $21, $7 + umulh $28, $21, $17 + addq $9, $7, $9 + cmpult $9, $7, $4 + addq $4, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + stq $9, 112($16) + stq $20, 120($16) + ldq $9, 8($30) + ldq $10, 16($30) + ret $31,($26),1 + .end bn_mul_comba8 diff --git a/crypto/bn/asm/mips1.s b/crypto/bn/asm/mips1.s new file mode 100644 index 0000000000..44fa1254c7 --- /dev/null +++ b/crypto/bn/asm/mips1.s @@ -0,0 +1,539 @@ +/* This assembler is for R2000/R3000 machines, or higher ones that do + * not want to do any 64 bit arithmetic. + * Make sure that the SSLeay bignum library is compiled with + * THIRTY_TWO_BIT set. 
+ * This must either be compiled with the system CC, or, if you use GNU gas, + * cc -E mips1.s|gas -o mips1.o + */ + .set reorder + .set noat + +#define R1 $1 +#define CC $2 +#define R2 $3 +#define R3 $8 +#define R4 $9 +#define L1 $10 +#define L2 $11 +#define L3 $12 +#define L4 $13 +#define H1 $14 +#define H2 $15 +#define H3 $24 +#define H4 $25 + +#define P1 $4 +#define P2 $5 +#define P3 $6 +#define P4 $7 + + .align 2 + .ent bn_mul_add_words + .globl bn_mul_add_words +.text +bn_mul_add_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + #blt P3,4,$lab34 + + subu R1,P3,4 + move CC,$0 + bltz R1,$lab34 +$lab2: + lw R1,0(P1) + lw L1,0(P2) + lw R2,4(P1) + lw L2,4(P2) + lw R3,8(P1) + lw L3,8(P2) + lw R4,12(P1) + lw L4,12(P2) + multu L1,P4 + addu R1,R1,CC + mflo L1 + sltu CC,R1,CC + addu R1,R1,L1 + mfhi H1 + sltu L1,R1,L1 + sw R1,0(P1) + addu CC,CC,L1 + multu L2,P4 + addu CC,H1,CC + mflo L2 + addu R2,R2,CC + sltu CC,R2,CC + mfhi H2 + addu R2,R2,L2 + addu P2,P2,16 + sltu L2,R2,L2 + sw R2,4(P1) + addu CC,CC,L2 + multu L3,P4 + addu CC,H2,CC + mflo L3 + addu R3,R3,CC + sltu CC,R3,CC + mfhi H3 + addu R3,R3,L3 + addu P1,P1,16 + sltu L3,R3,L3 + sw R3,-8(P1) + addu CC,CC,L3 + multu L4,P4 + addu CC,H3,CC + mflo L4 + addu R4,R4,CC + sltu CC,R4,CC + mfhi H4 + addu R4,R4,L4 + subu P3,P3,4 + sltu L4,R4,L4 + addu CC,CC,L4 + addu CC,H4,CC + + subu R1,P3,4 + sw R4,-4(P1) # delay slot + bgez R1,$lab2 + + bleu P3,0,$lab3 + .align 2 +$lab33: + lw L1,0(P2) + lw R1,0(P1) + multu L1,P4 + addu R1,R1,CC + sltu CC,R1,CC + addu P1,P1,4 + mflo L1 + mfhi H1 + addu R1,R1,L1 + addu P2,P2,4 + sltu L1,R1,L1 + subu P3,P3,1 + addu CC,CC,L1 + sw R1,-4(P1) + addu CC,H1,CC + bgtz P3,$lab33 + j $31 + .align 2 +$lab3: + j $31 + .align 2 +$lab34: + bgt P3,0,$lab33 + j $31 + .end bn_mul_add_words + + .align 2 + # Program Unit: bn_mul_words + .ent bn_mul_words + .globl bn_mul_words +.text +bn_mul_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P3,P3,4 + move 
CC,$0 + bltz P3,$lab45 +$lab44: + lw L1,0(P2) + lw L2,4(P2) + lw L3,8(P2) + lw L4,12(P2) + multu L1,P4 + subu P3,P3,4 + mflo L1 + mfhi H1 + addu L1,L1,CC + multu L2,P4 + sltu CC,L1,CC + sw L1,0(P1) + addu CC,H1,CC + mflo L2 + mfhi H2 + addu L2,L2,CC + multu L3,P4 + sltu CC,L2,CC + sw L2,4(P1) + addu CC,H2,CC + mflo L3 + mfhi H3 + addu L3,L3,CC + multu L4,P4 + sltu CC,L3,CC + sw L3,8(P1) + addu CC,H3,CC + mflo L4 + mfhi H4 + addu L4,L4,CC + addu P1,P1,16 + sltu CC,L4,CC + addu P2,P2,16 + addu CC,H4,CC + sw L4,-4(P1) + + bgez P3,$lab44 + b $lab45 +$lab46: + lw L1,0(P2) + addu P1,P1,4 + multu L1,P4 + addu P2,P2,4 + mflo L1 + mfhi H1 + addu L1,L1,CC + subu P3,P3,1 + sltu CC,L1,CC + sw L1,-4(P1) + addu CC,H1,CC + bgtz P3,$lab46 + j $31 +$lab45: + addu P3,P3,4 + bgtz P3,$lab46 + j $31 + .align 2 + .end bn_mul_words + + # Program Unit: bn_sqr_words + .ent bn_sqr_words + .globl bn_sqr_words +.text +bn_sqr_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P3,P3,4 + bltz P3,$lab55 +$lab54: + lw L1,0(P2) + lw L2,4(P2) + lw L3,8(P2) + lw L4,12(P2) + + multu L1,L1 + subu P3,P3,4 + mflo L1 + mfhi H1 + sw L1,0(P1) + sw H1,4(P1) + + multu L2,L2 + addu P1,P1,32 + mflo L2 + mfhi H2 + sw L2,-24(P1) + sw H2,-20(P1) + + multu L3,L3 + addu P2,P2,16 + mflo L3 + mfhi H3 + sw L3,-16(P1) + sw H3,-12(P1) + + multu L4,L4 + + mflo L4 + mfhi H4 + sw L4,-8(P1) + sw H4,-4(P1) + + bgtz P3,$lab54 + b $lab55 +$lab56: + lw L1,0(P2) + addu P1,P1,8 + multu L1,L1 + addu P2,P2,4 + subu P3,P3,1 + mflo L1 + mfhi H1 + sw L1,-8(P1) + sw H1,-4(P1) + + bgtz P3,$lab56 + j $31 +$lab55: + addu P3,P3,4 + bgtz P3,$lab56 + j $31 + .align 2 + .end bn_sqr_words + + # Program Unit: bn_add_words + .ent bn_add_words + .globl bn_add_words +.text +bn_add_words: # 0x590 + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P4,P4,4 + move CC,$0 + bltz P4,$lab65 +$lab64: + lw L1,0(P2) + lw R1,0(P3) + lw L2,4(P2) + lw R2,4(P3) + + addu L1,L1,CC + lw L3,8(P2) + sltu CC,L1,CC + addu 
L1,L1,R1 + sltu R1,L1,R1 + lw R3,8(P3) + addu CC,CC,R1 + lw L4,12(P2) + + addu L2,L2,CC + lw R4,12(P3) + sltu CC,L2,CC + addu L2,L2,R2 + sltu R2,L2,R2 + sw L1,0(P1) + addu CC,CC,R2 + addu P1,P1,16 + addu L3,L3,CC + sw L2,-12(P1) + + sltu CC,L3,CC + addu L3,L3,R3 + sltu R3,L3,R3 + addu P2,P2,16 + addu CC,CC,R3 + + addu L4,L4,CC + addu P3,P3,16 + sltu CC,L4,CC + addu L4,L4,R4 + subu P4,P4,4 + sltu R4,L4,R4 + sw L3,-8(P1) + addu CC,CC,R4 + sw L4,-4(P1) + + bgtz P4,$lab64 + b $lab65 +$lab66: + lw L1,0(P2) + lw R1,0(P3) + addu L1,L1,CC + addu P1,P1,4 + sltu CC,L1,CC + addu P2,P2,4 + addu P3,P3,4 + addu L1,L1,R1 + subu P4,P4,1 + sltu R1,L1,R1 + sw L1,-4(P1) + addu CC,CC,R1 + + bgtz P4,$lab66 + j $31 +$lab65: + addu P4,P4,4 + bgtz P4,$lab66 + j $31 + .end bn_add_words + + # Program Unit: bn_div64 + .set at + .set reorder + .text + .align 2 + .globl bn_div64 + # 321 { + .ent bn_div64 2 +bn_div64: + subu $sp, 64 + sw $31, 56($sp) + sw $16, 48($sp) + .mask 0x80010000, -56 + .frame $sp, 64, $31 + move $9, $4 + move $12, $5 + move $16, $6 + # 322 BN_ULONG dh,dl,q,ret=0,th,tl,t; + move $31, $0 + # 323 int i,count=2; + li $13, 2 + # 324 + # 325 if (d == 0) return(BN_MASK2); + bne $16, 0, $80 + li $2, -1 + b $93 +$80: + # 326 + # 327 i=BN_num_bits_word(d); + move $4, $16 + sw $31, 16($sp) + sw $9, 24($sp) + sw $12, 32($sp) + sw $13, 40($sp) + .livereg 0x800ff0e,0xfff + jal BN_num_bits_word + li $4, 32 + lw $31, 16($sp) + lw $9, 24($sp) + lw $12, 32($sp) + lw $13, 40($sp) + move $3, $2 + # 328 if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i)) + beq $2, $4, $81 + li $14, 1 + sll $15, $14, $2 + bleu $9, $15, $81 + # 329 { + # 330 #if !defined(NO_STDIO) && !defined(WIN16) + # 331 fprintf(stderr,"Division would overflow (%d)\n",i); + # 332 #endif + # 333 abort(); + sw $3, 8($sp) + sw $9, 24($sp) + sw $12, 32($sp) + sw $13, 40($sp) + # spill ret ($31) to the word-aligned slot 16($sp), the same slot used around BN_num_bits_word + sw $31, 16($sp) + .livereg 0xff0e,0xfff + jal abort + lw $3, 8($sp) + li $4, 32 + lw $9, 24($sp) + lw $12, 32($sp) + lw $13, 40($sp) + lw $31, 16($sp) + # 334 
} +$81: + # 335 i=BN_BITS2-i; + subu $3, $4, $3 + # 336 if (h >= d) h-=d; + bltu $9, $16, $82 + subu $9, $9, $16 +$82: + # 337 + # 338 if (i) + beq $3, 0, $83 + # 339 { + # 340 d<<=i; + sll $16, $16, $3 + # 341 h=(h<<i)|(l>>(BN_BITS2-i)); + sll $24, $9, $3 + subu $25, $4, $3 + srl $14, $12, $25 + or $9, $24, $14 + # 342 l<<=i; + sll $12, $12, $3 + # 343 } +$83: + # 344 dh=(d&BN_MASK2h)>>BN_BITS4; + # 345 dl=(d&BN_MASK2l); + and $8, $16, -65536 + srl $8, $8, 16 + and $10, $16, 65535 + li $6, -65536 +$84: + # 346 for (;;) + # 347 { + # 348 if ((h>>BN_BITS4) == dh) + srl $15, $9, 16 + bne $8, $15, $85 + # 349 q=BN_MASK2l; + li $5, 65535 + b $86 +$85: + # 350 else + # 351 q=h/dh; + divu $5, $9, $8 +$86: + # 352 + # 353 for (;;) + # 354 { + # 355 t=(h-q*dh); + mul $4, $5, $8 + subu $2, $9, $4 + move $3, $2 + # 356 if ((t&BN_MASK2h) || + # 357 ((dl*q) <= ( + # 358 (t<<BN_BITS4)+ + # 359 ((l&BN_MASK2h)>>BN_BITS4)))) + and $25, $2, $6 + bne $25, $0, $87 + mul $24, $10, $5 + sll $14, $3, 16 + and $15, $12, $6 + srl $25, $15, 16 + addu $15, $14, $25 + bgtu $24, $15, $88 +$87: + # 360 break; + mul $3, $10, $5 + b $89 +$88: + # 361 q--; + addu $5, $5, -1 + # 362 } + b $86 +$89: + # 363 th=q*dh; + # 364 tl=q*dl; + # 365 t=(tl>>BN_BITS4); + # 366 tl=(tl<<BN_BITS4)&BN_MASK2h; + sll $14, $3, 16 + and $2, $14, $6 + move $11, $2 + # 367 th+=t; + srl $25, $3, 16 + addu $7, $4, $25 + # 368 + # 369 if (l < tl) th++; + bgeu $12, $2, $90 + addu $7, $7, 1 +$90: + # 370 l-=tl; + subu $12, $12, $11 + # 371 if (h < th) + bgeu $9, $7, $91 + # 372 { + # 373 h+=d; + addu $9, $9, $16 + # 374 q--; + addu $5, $5, -1 + # 375 } +$91: + # 376 h-=th; + subu $9, $9, $7 + # 377 + # 378 if (--count == 0) break; + addu $13, $13, -1 + beq $13, 0, $92 + # 379 + # 380 ret=q<<BN_BITS4; + sll $31, $5, 16 + # 381 h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2; + sll $24, $9, 16 + srl $15, $12, 16 + or $9, $24, $15 + # 382 l=(l&BN_MASK2l)<<BN_BITS4; + and $12, $12, 65535 + sll $12, $12, 16 + # 383 } + b $84 +$92: + # 
384 ret|=q; + or $31, $31, $5 + # 385 return(ret); + move $2, $31 +$93: + lw $16, 48($sp) + lw $31, 56($sp) + addu $sp, 64 + j $31 + .end bn_div64 + diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s new file mode 100644 index 0000000000..e8fdd50d16 --- /dev/null +++ b/crypto/bn/asm/mips3.s @@ -0,0 +1,544 @@ +/* This assembler is for R4000 and above machines. It takes advantage + * of the 64 bit registers present on these CPUs. + * Make sure that the SSLeay bignum library is compiled with + * SIXTY_FOUR_BIT set and BN_LLONG undefined. + * This must either be compiled with the system CC, or, if you use GNU gas, + * cc -E mips3.s|gas -o mips3.o + */ + .set reorder + .set noat + +#define R1 $1 +#define CC $2 +#define R2 $3 +#define R3 $8 +#define R4 $9 +#define L1 $10 +#define L2 $11 +#define L3 $12 +#define L4 $13 +#define H1 $14 +#define H2 $15 +#define H3 $24 +#define H4 $25 + +#define P1 $4 +#define P2 $5 +#define P3 $6 +#define P4 $7 + + .align 2 + .ent bn_mul_add_words + .globl bn_mul_add_words +.text +bn_mul_add_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + #blt P3,4,$lab34 + + subu R1,P3,4 + move CC,$0 + bltz R1,$lab34 +$lab2: + ld R1,0(P1) + ld L1,0(P2) + ld R2,8(P1) + ld L2,8(P2) + ld R3,16(P1) + ld L3,16(P2) + ld R4,24(P1) + ld L4,24(P2) + dmultu L1,P4 + daddu R1,R1,CC + mflo L1 + sltu CC,R1,CC + daddu R1,R1,L1 + mfhi H1 + sltu L1,R1,L1 + sd R1,0(P1) + daddu CC,CC,L1 + dmultu L2,P4 + daddu CC,H1,CC + mflo L2 + daddu R2,R2,CC + sltu CC,R2,CC + mfhi H2 + daddu R2,R2,L2 + daddu P2,P2,32 + sltu L2,R2,L2 + sd R2,8(P1) + daddu CC,CC,L2 + dmultu L3,P4 + daddu CC,H2,CC + mflo L3 + daddu R3,R3,CC + sltu CC,R3,CC + mfhi H3 + daddu R3,R3,L3 + daddu P1,P1,32 + sltu L3,R3,L3 + sd R3,-16(P1) + daddu CC,CC,L3 + dmultu L4,P4 + daddu CC,H3,CC + mflo L4 + daddu R4,R4,CC + sltu CC,R4,CC + mfhi H4 + daddu R4,R4,L4 + subu P3,P3,4 + sltu L4,R4,L4 + daddu CC,CC,L4 + daddu CC,H4,CC + + subu R1,P3,4 + sd R4,-8(P1) # delay slot + bgez R1,$lab2 + + 
bleu P3,0,$lab3 + .align 2 +$lab33: + ld L1,0(P2) + ld R1,0(P1) + dmultu L1,P4 + daddu R1,R1,CC + sltu CC,R1,CC + daddu P1,P1,8 + mflo L1 + mfhi H1 + daddu R1,R1,L1 + daddu P2,P2,8 + sltu L1,R1,L1 + subu P3,P3,1 + daddu CC,CC,L1 + sd R1,-8(P1) + daddu CC,H1,CC + bgtz P3,$lab33 + j $31 + .align 2 +$lab3: + j $31 + .align 2 +$lab34: + bgt P3,0,$lab33 + j $31 + .end bn_mul_add_words + + .align 2 + # Program Unit: bn_mul_words + .ent bn_mul_words + .globl bn_mul_words +.text +bn_mul_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P3,P3,4 + move CC,$0 + bltz P3,$lab45 +$lab44: + ld L1,0(P2) + ld L2,8(P2) + ld L3,16(P2) + ld L4,24(P2) + dmultu L1,P4 + subu P3,P3,4 + mflo L1 + mfhi H1 + daddu L1,L1,CC + dmultu L2,P4 + sltu CC,L1,CC + sd L1,0(P1) + daddu CC,H1,CC + mflo L2 + mfhi H2 + daddu L2,L2,CC + dmultu L3,P4 + sltu CC,L2,CC + sd L2,8(P1) + daddu CC,H2,CC + mflo L3 + mfhi H3 + daddu L3,L3,CC + dmultu L4,P4 + sltu CC,L3,CC + sd L3,16(P1) + daddu CC,H3,CC + mflo L4 + mfhi H4 + daddu L4,L4,CC + daddu P1,P1,32 + sltu CC,L4,CC + daddu P2,P2,32 + daddu CC,H4,CC + sd L4,-8(P1) + + bgez P3,$lab44 + b $lab45 +$lab46: + ld L1,0(P2) + daddu P1,P1,8 + dmultu L1,P4 + daddu P2,P2,8 + mflo L1 + mfhi H1 + daddu L1,L1,CC + subu P3,P3,1 + sltu CC,L1,CC + sd L1,-8(P1) + daddu CC,H1,CC + bgtz P3,$lab46 + j $31 +$lab45: + addu P3,P3,4 + bgtz P3,$lab46 + j $31 + .align 2 + .end bn_mul_words + + # Program Unit: bn_sqr_words + .ent bn_sqr_words + .globl bn_sqr_words +.text +bn_sqr_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P3,P3,4 + b $lab55 + bltz P3,$lab55 +$lab54: + ld L1,0(P2) + ld L2,8(P2) + ld L3,16(P2) + ld L4,24(P2) + + dmultu L1,L1 + subu P3,P3,4 + mflo L1 + mfhi H1 + sd L1,0(P1) + sd H1,8(P1) + + dmultu L2,L2 + daddu P1,P1,32 + mflo L2 + mfhi H2 + sd L2,-48(P1) + sd H2,-40(P1) + + dmultu L3,L3 + daddu P2,P2,32 + mflo L3 + mfhi H3 + sd L3,-32(P1) + sd H3,-24(P1) + + dmultu L4,L4 + + mflo L4 + mfhi H4 + sd L4,-16(P1) + 
sd H4,-8(P1) + + bgtz P3,$lab54 + b $lab55 +$lab56: + ld L1,0(P2) + daddu P1,P1,16 + dmultu L1,L1 + daddu P2,P2,8 + subu P3,P3,1 + mflo L1 + mfhi H1 + sd L1,-16(P1) + sd H1,-8(P1) + + bgtz P3,$lab56 + j $31 +$lab55: + daddu P3,P3,4 + bgtz P3,$lab56 + j $31 + .align 2 + .end bn_sqr_words + + # Program Unit: bn_add_words + .ent bn_add_words + .globl bn_add_words +.text +bn_add_words: # 0x590 + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P4,P4,4 + move CC,$0 + bltz P4,$lab65 +$lab64: + ld L1,0(P2) + ld R1,0(P3) + ld L2,8(P2) + ld R2,8(P3) + + daddu L1,L1,CC + ld L3,16(P2) + sltu CC,L1,CC + daddu L1,L1,R1 + sltu R1,L1,R1 + ld R3,16(P3) + daddu CC,CC,R1 + ld L4,24(P2) + + daddu L2,L2,CC + ld R4,24(P3) + sltu CC,L2,CC + daddu L2,L2,R2 + sltu R2,L2,R2 + sd L1,0(P1) + daddu CC,CC,R2 + daddu P1,P1,32 + daddu L3,L3,CC + sd L2,-24(P1) + + sltu CC,L3,CC + daddu L3,L3,R3 + sltu R3,L3,R3 + daddu P2,P2,32 + daddu CC,CC,R3 + + daddu L4,L4,CC + daddu P3,P3,32 + sltu CC,L4,CC + daddu L4,L4,R4 + sltu R4,L4,R4 + subu P4,P4,4 + sd L3,-16(P1) + daddu CC,CC,R4 + sd L4,-8(P1) + + bgtz P4,$lab64 + b $lab65 +$lab66: + ld L1,0(P2) + ld R1,0(P3) + daddu L1,L1,CC + daddu P1,P1,8 + sltu CC,L1,CC + daddu P2,P2,8 + daddu P3,P3,8 + daddu L1,L1,R1 + subu P4,P4,1 + sltu R1,L1,R1 + sd L1,-8(P1) + daddu CC,CC,R1 + + bgtz P4,$lab66 + j $31 +$lab65: + addu P4,P4,4 + bgtz P4,$lab66 + j $31 + .end bn_add_words + +#if 1 + # Program Unit: bn_div64 + .set at + .set reorder + .text + .align 2 + .globl bn_div64 + # 321 { + .ent bn_div64 +bn_div64: + dsubu $sp, 64 + sd $31, 56($sp) + sd $16, 48($sp) + .mask 0x80010000, -56 + .frame $sp, 64, $31 + move $9, $4 + move $12, $5 + move $16, $6 + # 322 BN_ULONG dh,dl,q,ret=0,th,tl,t; + move $31, $0 + # 323 int i,count=2; + li $13, 2 + # 324 + # 325 if (d == 0) return(BN_MASK2); + bne $16, 0, $80 + dli $2, -1 + b $93 +$80: + # 326 + # 327 i=BN_num_bits_word(d); + move $4, $16 + sd $31, 16($sp) + sd $9, 24($sp) + sd $12, 32($sp) + sd $13, 
40($sp) + .livereg 0x800ff0e,0xfff + jal BN_num_bits_word + dli $4, 64 + ld $31, 16($sp) + ld $9, 24($sp) + ld $12, 32($sp) + ld $13, 40($sp) + move $3, $2 + # 328 if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i)) + beq $2, $4, $81 + dli $14, 1 + dsll $15, $14, $2 + bleu $9, $15, $81 + # 329 { + # 330 #if !defined(NO_STDIO) && !defined(WIN16) + # 331 fprintf(stderr,"Division would overflow (%d)\n",i); + # 332 #endif + # 333 abort(); + sd $3, 8($sp) + sd $31, 16($sp) + sd $9, 24($sp) + sd $12, 32($sp) + sd $13, 40($sp) + .livereg 0xff0e,0xfff + jal abort + dli $4, 64 + ld $3, 8($sp) + ld $31, 16($sp) + ld $9, 24($sp) + ld $12, 32($sp) + ld $13, 40($sp) + # 334 } +$81: + # 335 i=BN_BITS2-i; + dsubu $3, $4, $3 + # 336 if (h >= d) h-=d; + bltu $9, $16, $82 + dsubu $9, $9, $16 +$82: + # 337 + # 338 if (i) + beq $3, 0, $83 + # 339 { + # 340 d<<=i; + dsll $16, $16, $3 + # 341 h=(h<<i)|(l>>(BN_BITS2-i)); + dsll $24, $9, $3 + dsubu $25, $4, $3 + dsrl $14, $12, $25 + or $9, $24, $14 + # 342 l<<=i; + dsll $12, $12, $3 + # 343 } +$83: + # 344 dh=(d&BN_MASK2h)>>BN_BITS4; + # 345 dl=(d&BN_MASK2l); + and $8, $16,0xFFFFFFFF00000000 + dsrl $8, $8, 32 + # dli $10,0xFFFFFFFF # Is this needed? 
+ # and $10, $16, $10 + dsll $10, $16, 32 + dsrl $10, $10, 32 + dli $6,0xFFFFFFFF00000000 +$84: + # 346 for (;;) + # 347 { + # 348 if ((h>>BN_BITS4) == dh) + dsrl $15, $9, 32 + bne $8, $15, $85 + # 349 q=BN_MASK2l; + dli $5, 0xFFFFFFFF + b $86 +$85: + # 350 else + # 351 q=h/dh; + ddivu $5, $9, $8 +$86: + # 352 + # 353 for (;;) + # 354 { + # 355 t=(h-q*dh); + dmul $4, $5, $8 + dsubu $2, $9, $4 + move $3, $2 + # 356 if ((t&BN_MASK2h) || + # 357 ((dl*q) <= ( + # 358 (t<<BN_BITS4)+ + # 359 ((l&BN_MASK2h)>>BN_BITS4)))) + and $25, $2, $6 + bne $25, $0, $87 + dmul $24, $10, $5 + dsll $14, $3, 32 + and $15, $12, $6 + dsrl $25, $15, 32 + daddu $15, $14, $25 + bgtu $24, $15, $88 +$87: + # 360 break; + dmul $3, $10, $5 + b $89 +$88: + # 361 q--; + daddu $5, $5, -1 + # 362 } + b $86 +$89: + # 363 th=q*dh; + # 364 tl=q*dl; + # 365 t=(tl>>BN_BITS4); + # 366 tl=(tl<<BN_BITS4)&BN_MASK2h; + dsll $14, $3, 32 + and $2, $14, $6 + move $11, $2 + # 367 th+=t; + dsrl $25, $3, 32 + daddu $7, $4, $25 + # 368 + # 369 if (l < tl) th++; + bgeu $12, $2, $90 + daddu $7, $7, 1 +$90: + # 370 l-=tl; + dsubu $12, $12, $11 + # 371 if (h < th) + bgeu $9, $7, $91 + # 372 { + # 373 h+=d; + daddu $9, $9, $16 + # 374 q--; + daddu $5, $5, -1 + # 375 } +$91: + # 376 h-=th; + dsubu $9, $9, $7 + # 377 + # 378 if (--count == 0) break; + addu $13, $13, -1 + beq $13, 0, $92 + # 379 + # 380 ret=q<<BN_BITS4; + dsll $31, $5, 32 + # 381 h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2; + dsll $24, $9, 32 + dsrl $15, $12, 32 + or $9, $24, $15 + # 382 l=(l&BN_MASK2l)<<BN_BITS4; + and $12, $12, 0xFFFFFFFF + dsll $12, $12, 32 + # 383 } + b $84 +$92: + # 384 ret|=q; + or $31, $31, $5 + # 385 return(ret); + move $2, $31 +$93: + ld $16, 48($sp) + ld $31, 56($sp) + daddu $sp, 64 + j $31 + .end bn_div64 +#endif diff --git a/crypto/bn/asm/x86.pl b/crypto/bn/asm/x86.pl new file mode 100644 index 0000000000..bf869fd0ee --- /dev/null +++ b/crypto/bn/asm/x86.pl @@ -0,0 +1,28 @@ +#!/usr/local/bin/perl + 
+push(@INC,"perlasm","../../perlasm"); +require "x86asm.pl"; + +require("x86/mul_add.pl"); +require("x86/mul.pl"); +require("x86/sqr.pl"); +require("x86/div.pl"); +require("x86/add.pl"); +require("x86/sub.pl"); +require("x86/comba.pl"); + +&asm_init($ARGV[0],"bn-586.pl"); + +&bn_mul_add_words("bn_mul_add_words"); +&bn_mul_words("bn_mul_words"); +&bn_sqr_words("bn_sqr_words"); +&bn_div_words("bn_div_words"); +&bn_add_words("bn_add_words"); +&bn_sub_words("bn_sub_words"); +&bn_mul_comba("bn_mul_comba8",8); +&bn_mul_comba("bn_mul_comba4",4); +&bn_sqr_comba("bn_sqr_comba8",8); +&bn_sqr_comba("bn_sqr_comba4",4); + +&asm_finish(); + diff --git a/crypto/bn/asm/x86/add.pl b/crypto/bn/asm/x86/add.pl new file mode 100644 index 0000000000..0b5cf583e3 --- /dev/null +++ b/crypto/bn/asm/x86/add.pl @@ -0,0 +1,76 @@ +#!/usr/local/bin/perl +# x86 assember + +sub bn_add_words + { + local($name)=@_; + + &function_begin($name,""); + + &comment(""); + $a="esi"; + $b="edi"; + $c="eax"; + $r="ebx"; + $tmp1="ecx"; + $tmp2="edx"; + $num="ebp"; + + &mov($r,&wparam(0)); # get r + &mov($a,&wparam(1)); # get a + &mov($b,&wparam(2)); # get b + &mov($num,&wparam(3)); # get num + &xor($c,$c); # clear carry + &and($num,0xfffffff8); # num / 8 + + &jz(&label("aw_finish")); + + &set_label("aw_loop",0); + for ($i=0; $i<8; $i++) + { + &comment("Round $i"); + + &mov($tmp1,&DWP($i*4,$a,"",0)); # *a + &mov($tmp2,&DWP($i*4,$b,"",0)); # *b + &add($tmp1,$c); + &mov($c,0); + &adc($c,$c); + &add($tmp1,$tmp2); + &adc($c,0); + &mov(&DWP($i*4,$r,"",0),$tmp1); # *r + } + + &comment(""); + &add($a,32); + &add($b,32); + &add($r,32); + &sub($num,8); + &jnz(&label("aw_loop")); + + &set_label("aw_finish",0); + &mov($num,&wparam(3)); # get num + &and($num,7); + &jz(&label("aw_end")); + + for ($i=0; $i<7; $i++) + { + &comment("Tail Round $i"); + &mov($tmp1,&DWP($i*4,$a,"",0)); # *a + &mov($tmp2,&DWP($i*4,$b,"",0));# *b + &add($tmp1,$c); + &mov($c,0); + &adc($c,$c); + &add($tmp1,$tmp2); + &adc($c,0); + &dec($num) if ($i 
!= 6); + &mov(&DWP($i*4,$r,"",0),$tmp1); # *a + &jz(&label("aw_end")) if ($i != 6); + } + &set_label("aw_end",0); + +# &mov("eax",$c); # $c is "eax" + + &function_end($name); + } + +1; diff --git a/crypto/bn/asm/x86/comba.pl b/crypto/bn/asm/x86/comba.pl new file mode 100644 index 0000000000..2291253629 --- /dev/null +++ b/crypto/bn/asm/x86/comba.pl @@ -0,0 +1,277 @@ +#!/usr/local/bin/perl +# x86 assember + +sub mul_add_c + { + local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; + + # pos == -1 if eax and edx are pre-loaded, 0 to load from next + # words, and 1 if load return value + + &comment("mul a[$ai]*b[$bi]"); + + # "eax" and "edx" will always be pre-loaded. + # &mov("eax",&DWP($ai*4,$a,"",0)) ; + # &mov("edx",&DWP($bi*4,$b,"",0)); + + &mul("edx"); + &add($c0,"eax"); + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a + &mov("eax",&wparam(0)) if $pos > 0; # load r[] + ### + &adc($c1,"edx"); + &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b + &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b + ### + &adc($c2,0); + # is pos > 1, it means it is the last loop + &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[]; + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a + } + +sub sqr_add_c + { + local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; + + # pos == -1 if eax and edx are pre-loaded, 0 to load from next + # words, and 1 if load return value + + &comment("sqr a[$ai]*a[$bi]"); + + # "eax" and "edx" will always be pre-loaded. 
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ; + # &mov("edx",&DWP($bi*4,$b,"",0)); + + if ($ai == $bi) + { &mul("eax");} + else + { &mul("edx");} + &add($c0,"eax"); + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a + ### + &adc($c1,"edx"); + &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb); + ### + &adc($c2,0); + # is pos > 1, it means it is the last loop + &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b + } + +sub sqr_add_c2 + { + local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; + + # pos == -1 if eax and edx are pre-loaded, 0 to load from next + # words, and 1 if load return value + + &comment("sqr a[$ai]*a[$bi]"); + + # "eax" and "edx" will always be pre-loaded. + # &mov("eax",&DWP($ai*4,$a,"",0)) ; + # &mov("edx",&DWP($bi*4,$a,"",0)); + + if ($ai == $bi) + { &mul("eax");} + else + { &mul("edx");} + &add("eax","eax"); + ### + &adc("edx","edx"); + ### + &adc($c2,0); + &add($c0,"eax"); + &adc($c1,"edx"); + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b + &adc($c2,0); + &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; + &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb); + ### + } + +sub bn_mul_comba + { + local($name,$num)=@_; + local($a,$b,$c0,$c1,$c2); + local($i,$as,$ae,$bs,$be,$ai,$bi); + local($tot,$end); + + &function_begin_B($name,""); + + $c0="ebx"; + $c1="ecx"; + $c2="ebp"; + $a="esi"; + $b="edi"; + + $as=0; + $ae=0; + $bs=0; + $be=0; + $tot=$num+$num-1; + + &push("esi"); + &mov($a,&wparam(1)); + &push("edi"); + &mov($b,&wparam(2)); + &push("ebp"); + &push("ebx"); + + &xor($c0,$c0); + &mov("eax",&DWP(0,$a,"",0)); # load the first word + &xor($c1,$c1); + &mov("edx",&DWP(0,$b,"",0)); # load the first second + + for ($i=0; $i<$tot; $i++) + { + $ai=$as; + $bi=$bs; + $end=$be+1; + + &comment("################## Calculate word $i"); + + for ($j=$bs; $j<$end; $j++) + { + 
&xor($c2,$c2) if ($j == $bs); + if (($j+1) == $end) + { + $v=1; + $v=2 if (($i+1) == $tot); + } + else + { $v=0; } + if (($j+1) != $end) + { + $na=($ai-1); + $nb=($bi+1); + } + else + { + $na=$as+($i < ($num-1)); + $nb=$bs+($i >= ($num-1)); + } +#printf STDERR "[$ai,$bi] -> [$na,$nb]\n"; + &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb); + if ($v) + { + &comment("saved r[$i]"); + # &mov("eax",&wparam(0)); + # &mov(&DWP($i*4,"eax","",0),$c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + } + $ai--; + $bi++; + } + $as++ if ($i < ($num-1)); + $ae++ if ($i >= ($num-1)); + + $bs++ if ($i >= ($num-1)); + $be++ if ($i < ($num-1)); + } + &comment("save r[$i]"); + # &mov("eax",&wparam(0)); + &mov(&DWP($i*4,"eax","",0),$c0); + + &pop("ebx"); + &pop("ebp"); + &pop("edi"); + &pop("esi"); + &ret(); + &function_end_B($name); + } + +sub bn_sqr_comba + { + local($name,$num)=@_; + local($r,$a,$c0,$c1,$c2)=@_; + local($i,$as,$ae,$bs,$be,$ai,$bi); + local($b,$tot,$end,$half); + + &function_begin_B($name,""); + + $c0="ebx"; + $c1="ecx"; + $c2="ebp"; + $a="esi"; + $r="edi"; + + &push("esi"); + &push("edi"); + &push("ebp"); + &push("ebx"); + &mov($r,&wparam(0)); + &mov($a,&wparam(1)); + &xor($c0,$c0); + &xor($c1,$c1); + &mov("eax",&DWP(0,$a,"",0)); # load the first word + + $as=0; + $ae=0; + $bs=0; + $be=0; + $tot=$num+$num-1; + + for ($i=0; $i<$tot; $i++) + { + $ai=$as; + $bi=$bs; + $end=$be+1; + + &comment("############### Calculate word $i"); + for ($j=$bs; $j<$end; $j++) + { + &xor($c2,$c2) if ($j == $bs); + if (($ai-1) < ($bi+1)) + { + $v=1; + $v=2 if ($i+1) == $tot; + } + else + { $v=0; } + if (!$v) + { + $na=$ai-1; + $nb=$bi+1; + } + else + { + $na=$as+($i < ($num-1)); + $nb=$bs+($i >= ($num-1)); + } + if ($ai == $bi) + { + &sqr_add_c($r,$a,$ai,$bi, + $c0,$c1,$c2,$v,$i,$na,$nb); + } + else + { + &sqr_add_c2($r,$a,$ai,$bi, + $c0,$c1,$c2,$v,$i,$na,$nb); + } + if ($v) + { + &comment("saved r[$i]"); + #&mov(&DWP($i*4,$r,"",0),$c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + last; + } + $ai--; + $bi++; 
+ } + $as++ if ($i < ($num-1)); + $ae++ if ($i >= ($num-1)); + + $bs++ if ($i >= ($num-1)); + $be++ if ($i < ($num-1)); + } + &mov(&DWP($i*4,$r,"",0),$c0); + &pop("ebx"); + &pop("ebp"); + &pop("edi"); + &pop("esi"); + &ret(); + &function_end_B($name); + } + +1; diff --git a/crypto/bn/asm/x86/div.pl b/crypto/bn/asm/x86/div.pl new file mode 100644 index 0000000000..0e90152caa --- /dev/null +++ b/crypto/bn/asm/x86/div.pl @@ -0,0 +1,15 @@ +#!/usr/local/bin/perl +# x86 assember + +sub bn_div_words + { + local($name)=@_; + + &function_begin($name,""); + &mov("edx",&wparam(0)); # + &mov("eax",&wparam(1)); # + &mov("ebx",&wparam(2)); # + &div("ebx"); + &function_end($name); + } +1; diff --git a/crypto/bn/asm/x86/f b/crypto/bn/asm/x86/f new file mode 100644 index 0000000000..22e4112224 --- /dev/null +++ b/crypto/bn/asm/x86/f @@ -0,0 +1,3 @@ +#!/usr/local/bin/perl +# x86 assember + diff --git a/crypto/bn/asm/x86/mul.pl b/crypto/bn/asm/x86/mul.pl new file mode 100644 index 0000000000..674cb9b055 --- /dev/null +++ b/crypto/bn/asm/x86/mul.pl @@ -0,0 +1,77 @@ +#!/usr/local/bin/perl +# x86 assember + +sub bn_mul_words + { + local($name)=@_; + + &function_begin($name,""); + + &comment(""); + $Low="eax"; + $High="edx"; + $a="ebx"; + $w="ecx"; + $r="edi"; + $c="esi"; + $num="ebp"; + + &xor($c,$c); # clear carry + &mov($r,&wparam(0)); # + &mov($a,&wparam(1)); # + &mov($num,&wparam(2)); # + &mov($w,&wparam(3)); # + + &and($num,0xfffffff8); # num / 8 + &jz(&label("mw_finish")); + + &set_label("mw_loop",0); + for ($i=0; $i<32; $i+=4) + { + &comment("Round $i"); + + &mov("eax",&DWP($i,$a,"",0)); # *a + &mul($w); # *a * w + &add("eax",$c); # L(t)+=c + # XXX + + &adc("edx",0); # H(t)+=carry + &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); + + &mov($c,"edx"); # c= H(t); + } + + &comment(""); + &add($a,32); + &add($r,32); + &sub($num,8); + &jz(&label("mw_finish")); + &jmp(&label("mw_loop")); + + &set_label("mw_finish",0); + &mov($num,&wparam(2)); # get num + &and($num,7); + 
&jnz(&label("mw_finish2")); + &jmp(&label("mw_end")); + + &set_label("mw_finish2",1); + for ($i=0; $i<7; $i++) + { + &comment("Tail Round $i"); + &mov("eax",&DWP($i*4,$a,"",0));# *a + &mul($w); # *a * w + &add("eax",$c); # L(t)+=c + # XXX + &adc("edx",0); # H(t)+=carry + &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t); + &mov($c,"edx"); # c= H(t); + &dec($num) if ($i != 7-1); + &jz(&label("mw_end")) if ($i != 7-1); + } + &set_label("mw_end",0); + &mov("eax",$c); + + &function_end($name); + } + +1; diff --git a/crypto/bn/asm/x86/mul_add.pl b/crypto/bn/asm/x86/mul_add.pl new file mode 100644 index 0000000000..61830d3a90 --- /dev/null +++ b/crypto/bn/asm/x86/mul_add.pl @@ -0,0 +1,87 @@ +#!/usr/local/bin/perl +# x86 assember + +sub bn_mul_add_words + { + local($name)=@_; + + &function_begin($name,""); + + &comment(""); + $Low="eax"; + $High="edx"; + $a="ebx"; + $w="ebp"; + $r="edi"; + $c="esi"; + + &xor($c,$c); # clear carry + &mov($r,&wparam(0)); # + + &mov("ecx",&wparam(2)); # + &mov($a,&wparam(1)); # + + &and("ecx",0xfffffff8); # num / 8 + &mov($w,&wparam(3)); # + + &push("ecx"); # Up the stack for a tmp variable + + &jz(&label("maw_finish")); + + &set_label("maw_loop",0); + + &mov(&swtmp(0),"ecx"); # + + for ($i=0; $i<32; $i+=4) + { + &comment("Round $i"); + + &mov("eax",&DWP($i,$a,"",0)); # *a + &mul($w); # *a * w + &add("eax",$c); # L(t)+= *r + &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r + &adc("edx",0); # H(t)+=carry + &add("eax",$c); # L(t)+=c + &adc("edx",0); # H(t)+=carry + &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); + &mov($c,"edx"); # c= H(t); + } + + &comment(""); + &mov("ecx",&swtmp(0)); # + &add($a,32); + &add($r,32); + &sub("ecx",8); + &jnz(&label("maw_loop")); + + &set_label("maw_finish",0); + &mov("ecx",&wparam(2)); # get num + &and("ecx",7); + &jnz(&label("maw_finish2")); # helps branch prediction + &jmp(&label("maw_end")); + + &set_label("maw_finish2",1); + for ($i=0; $i<7; $i++) + { + &comment("Tail Round $i"); + &mov("eax",&DWP($i*4,$a,"",0));# *a + &mul($w); 
# *a * w + &add("eax",$c); # L(t)+=c + &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r + &adc("edx",0); # H(t)+=carry + &add("eax",$c); + &adc("edx",0); # H(t)+=carry + &dec("ecx") if ($i != 7-1); + &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t); + &mov($c,"edx"); # c= H(t); + &jz(&label("maw_end")) if ($i != 7-1); + } + &set_label("maw_end",0); + &mov("eax",$c); + + &pop("ecx"); # clear variable from + + &function_end($name); + } + +1; diff --git a/crypto/bn/asm/x86/sqr.pl b/crypto/bn/asm/x86/sqr.pl new file mode 100644 index 0000000000..1f90993cf6 --- /dev/null +++ b/crypto/bn/asm/x86/sqr.pl @@ -0,0 +1,60 @@ +#!/usr/local/bin/perl +# x86 assember + +sub bn_sqr_words + { + local($name)=@_; + + &function_begin($name,""); + + &comment(""); + $r="esi"; + $a="edi"; + $num="ebx"; + + &mov($r,&wparam(0)); # + &mov($a,&wparam(1)); # + &mov($num,&wparam(2)); # + + &and($num,0xfffffff8); # num / 8 + &jz(&label("sw_finish")); + + &set_label("sw_loop",0); + for ($i=0; $i<32; $i+=4) + { + &comment("Round $i"); + &mov("eax",&DWP($i,$a,"",0)); # *a + # XXX + &mul("eax"); # *a * *a + &mov(&DWP($i*2,$r,"",0),"eax"); # + &mov(&DWP($i*2+4,$r,"",0),"edx");# + } + + &comment(""); + &add($a,32); + &add($r,64); + &sub($num,8); + &jnz(&label("sw_loop")); + + &set_label("sw_finish",0); + &mov($num,&wparam(2)); # get num + &and($num,7); + &jz(&label("sw_end")); + + for ($i=0; $i<7; $i++) + { + &comment("Tail Round $i"); + &mov("eax",&DWP($i*4,$a,"",0)); # *a + # XXX + &mul("eax"); # *a * *a + &mov(&DWP($i*8,$r,"",0),"eax"); # + &dec($num) if ($i != 7-1); + &mov(&DWP($i*8+4,$r,"",0),"edx"); + &jz(&label("sw_end")) if ($i != 7-1); + } + &set_label("sw_end",0); + + &function_end($name); + } + +1; diff --git a/crypto/bn/asm/x86/sub.pl b/crypto/bn/asm/x86/sub.pl new file mode 100644 index 0000000000..837b0e1b07 --- /dev/null +++ b/crypto/bn/asm/x86/sub.pl @@ -0,0 +1,76 @@ +#!/usr/local/bin/perl +# x86 assember + +sub bn_sub_words + { + local($name)=@_; + + &function_begin($name,""); + + &comment(""); + 
$a="esi"; + $b="edi"; + $c="eax"; + $r="ebx"; + $tmp1="ecx"; + $tmp2="edx"; + $num="ebp"; + + &mov($r,&wparam(0)); # get r + &mov($a,&wparam(1)); # get a + &mov($b,&wparam(2)); # get b + &mov($num,&wparam(3)); # get num + &xor($c,$c); # clear carry + &and($num,0xfffffff8); # num / 8 + + &jz(&label("aw_finish")); + + &set_label("aw_loop",0); + for ($i=0; $i<8; $i++) + { + &comment("Round $i"); + + &mov($tmp1,&DWP($i*4,$a,"",0)); # *a + &mov($tmp2,&DWP($i*4,$b,"",0)); # *b + &sub($tmp1,$c); + &mov($c,0); + &adc($c,$c); + &sub($tmp1,$tmp2); + &adc($c,0); + &mov(&DWP($i*4,$r,"",0),$tmp1); # *r + } + + &comment(""); + &add($a,32); + &add($b,32); + &add($r,32); + &sub($num,8); + &jnz(&label("aw_loop")); + + &set_label("aw_finish",0); + &mov($num,&wparam(3)); # get num + &and($num,7); + &jz(&label("aw_end")); + + for ($i=0; $i<7; $i++) + { + &comment("Tail Round $i"); + &mov($tmp1,&DWP($i*4,$a,"",0)); # *a + &mov($tmp2,&DWP($i*4,$b,"",0));# *b + &sub($tmp1,$c); + &mov($c,0); + &adc($c,$c); + &sub($tmp1,$tmp2); + &adc($c,0); + &dec($num) if ($i != 6); + &mov(&DWP($i*4,$r,"",0),$tmp1); # *a + &jz(&label("aw_end")) if ($i != 6); + } + &set_label("aw_end",0); + +# &mov("eax",$c); # $c is "eax" + + &function_end($name); + } + +1; diff --git a/crypto/bn/asm/x86w16.asm b/crypto/bn/asm/x86w16.asm index 74a933a8cd..80a9ed6eef 100644 --- a/crypto/bn/asm/x86w16.asm +++ b/crypto/bn/asm/x86w16.asm @@ -6,11 +6,11 @@ F_TEXT SEGMENT WORD PUBLIC 'CODE' F_TEXT ENDS _DATA SEGMENT WORD PUBLIC 'DATA' _DATA ENDS -CONST SEGMENT WORD PUBLIC 'CONST' -CONST ENDS +_CONST SEGMENT WORD PUBLIC 'CONST' +_CONST ENDS _BSS SEGMENT WORD PUBLIC 'BSS' _BSS ENDS -DGROUP GROUP CONST, _BSS, _DATA +DGROUP GROUP _CONST, _BSS, _DATA ASSUME DS: DGROUP, SS: DGROUP F_TEXT SEGMENT ASSUME CS: F_TEXT diff --git a/crypto/bn/asm/x86w32.asm b/crypto/bn/asm/x86w32.asm index fc6f917714..957d71e3b1 100644 --- a/crypto/bn/asm/x86w32.asm +++ b/crypto/bn/asm/x86w32.asm @@ -6,11 +6,11 @@ F_TEXT SEGMENT WORD USE16 PUBLIC 'CODE' 
F_TEXT ENDS _DATA SEGMENT WORD USE16 PUBLIC 'DATA' _DATA ENDS -CONST SEGMENT WORD USE16 PUBLIC 'CONST' -CONST ENDS +_CONST SEGMENT WORD USE16 PUBLIC 'CONST' +_CONST ENDS _BSS SEGMENT WORD USE16 PUBLIC 'BSS' _BSS ENDS -DGROUP GROUP CONST, _BSS, _DATA +DGROUP GROUP _CONST, _BSS, _DATA ASSUME DS: DGROUP, SS: DGROUP F_TEXT SEGMENT ASSUME CS: F_TEXT @@ -89,7 +89,7 @@ $L555: mov bp,WORD PTR [bp+26] ; load num and bp,3 dec bp - js $L547 + js $L547m mov eax,ecx mul DWORD PTR es:[bx] ; w* *a @@ -100,7 +100,7 @@ $L555: mov DWORD PTR ds:[di],eax mov esi,edx dec bp - js $L547 ; Note that we are now testing for -1 + js $L547m ; Note that we are now testing for -1 ; mov eax,ecx mul DWORD PTR es:[bx+4] ; w* *a @@ -111,7 +111,7 @@ $L555: mov DWORD PTR ds:[di+4],eax mov esi,edx dec bp - js $L547 + js $L547m ; mov eax,ecx mul DWORD PTR es:[bx+8] ; w* *a @@ -121,7 +121,7 @@ $L555: adc edx,0 mov DWORD PTR ds:[di+8],eax mov esi,edx -$L547: +$L547m: mov eax,esi mov edx,esi shr edx,16 @@ -315,37 +315,35 @@ _bn_add_words PROC FAR ; ap = 22 ; rp = 18 xor esi,esi ;c=0; + mov bx,WORD PTR [bp+18] ; load low r mov si,WORD PTR [bp+22] ; load a mov es,WORD PTR [bp+24] ; load a mov di,WORD PTR [bp+26] ; load b mov ds,WORD PTR [bp+28] ; load b mov dx,WORD PTR [bp+30] ; load num - dec dx - js $L547 xor ecx,ecx + dec dx + js $L547a $L5477: - xor ebx,ebx mov eax,DWORD PTR es:[si] ; *a add eax,ecx - adc ebx,0 + mov ecx,0 + adc ecx,0 add si,4 ; a++ add eax,DWORD PTR ds:[di] ; + *b - mov ecx,ebx adc ecx,0 - add di,4 - mov bx,WORD PTR [bp+18] mov ds,WORD PTR [bp+20] + add di,4 mov DWORD PTR ds:[bx],eax - add bx,4 mov ds,WORD PTR [bp+28] - mov WORD PTR [bp+18],bx + add bx,4 dec dx - js $L547 ; Note that we are now testing for -1 + js $L547a ; Note that we are now testing for -1 jmp $L5477 ; -$L547: +$L547a: mov eax,ecx mov edx,ecx shr edx,16 diff --git a/crypto/bn/bn.err b/crypto/bn/bn.err index 7ccc247c41..ba5c9bc97e 100644 --- a/crypto/bn/bn.err +++ b/crypto/bn/bn.err @@ -16,12 +16,15 @@ #define 
BN_F_BN_MPI2BN 112 #define BN_F_BN_NEW 113 #define BN_F_BN_RAND 114 +#define BN_F_BN_USUB 115 /* Reason codes. */ -#define BN_R_BAD_RECIPROCAL 100 -#define BN_R_CALLED_WITH_EVEN_MODULUS 101 -#define BN_R_DIV_BY_ZERO 102 -#define BN_R_ENCODING_ERROR 103 -#define BN_R_INVALID_LENGTH 104 -#define BN_R_NOT_INITALISED 105 -#define BN_R_NO_INVERSE 106 +#define BN_R_ARG2_LT_ARG3 100 +#define BN_R_BAD_RECIPROCAL 101 +#define BN_R_CALLED_WITH_EVEN_MODULUS 102 +#define BN_R_DIV_BY_ZERO 103 +#define BN_R_ENCODING_ERROR 104 +#define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA 105 +#define BN_R_INVALID_LENGTH 106 +#define BN_R_NOT_INITALISED 107 +#define BN_R_NO_INVERSE 108 diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h index 66dde285d6..2c14a1d582 100644 --- a/crypto/bn/bn.h +++ b/crypto/bn/bn.h @@ -77,6 +77,9 @@ extern "C" { #define BN_LLONG /* This comment stops Configure mutilating things */ #endif +#define BN_MUL_COMBA +#define BN_SQR_COMBA +#undef BN_RECURSION #define RECP_MUL_MOD #define MONT_MUL_MOD @@ -105,6 +108,7 @@ extern "C" { #undef SIXTEEN_BIT #undef EIGHT_BIT + /* assuming long is 64bit - this is the DEC Alpha * unsigned long long is only 64 bits :-(, don't define * BN_LLONG for the DEC Alpha */ @@ -116,17 +120,23 @@ extern "C" { #define BN_BYTES 8 #define BN_BITS2 64 #define BN_BITS4 32 +#define BN_MASK (0xffffffffffffffffffffffffffffffffLL) #define BN_MASK2 (0xffffffffffffffffL) #define BN_MASK2l (0xffffffffL) #define BN_MASK2h (0xffffffff00000000L) #define BN_MASK2h1 (0xffffffff80000000L) #define BN_TBIT (0x8000000000000000L) -#define BN_DEC_CONV (10000000000000000000L) +#define BN_DEC_CONV (10000000000000000000UL) #define BN_DEC_FMT1 "%lu" #define BN_DEC_FMT2 "%019lu" #define BN_DEC_NUM 19 #endif +/* This is where the long long data type is 64 bits, but long is 32. + * For machines where there are 64bit registers, this is the mode to use. + * IRIX, on R4000 and above should use this mode, along with the relevent + * assember code :-). Do NOT define BN_ULLONG. 
+ */ #ifdef SIXTY_FOUR_BIT #undef BN_LLONG /* #define BN_ULLONG unsigned long long */ @@ -141,9 +151,9 @@ extern "C" { #define BN_MASK2h (0xffffffff00000000LL) #define BN_MASK2h1 (0xffffffff80000000LL) #define BN_TBIT (0x8000000000000000LL) -#define BN_DEC_CONV (10000000000000000000L) -#define BN_DEC_FMT1 "%lu" -#define BN_DEC_FMT2 "%019lu" +#define BN_DEC_CONV (10000000000000000000LL) +#define BN_DEC_FMT1 "%llu" +#define BN_DEC_FMT2 "%019llu" #define BN_DEC_NUM 19 #endif @@ -159,6 +169,7 @@ extern "C" { #define BN_BYTES 4 #define BN_BITS2 32 #define BN_BITS4 16 +#define BN_MASK (0xffffffffffffffffLL) #define BN_MASK2 (0xffffffffL) #define BN_MASK2l (0xffff) #define BN_MASK2h1 (0xffff8000L) @@ -181,6 +192,7 @@ extern "C" { #define BN_BYTES 2 #define BN_BITS2 16 #define BN_BITS4 8 +#define BN_MASK (0xffffffff) #define BN_MASK2 (0xffff) #define BN_MASK2l (0xff) #define BN_MASK2h1 (0xff80) @@ -203,6 +215,7 @@ extern "C" { #define BN_BYTES 1 #define BN_BITS2 8 #define BN_BITS4 4 +#define BN_MASK (0xffff) #define BN_MASK2 (0xff) #define BN_MASK2l (0xf) #define BN_MASK2h1 (0xf8) @@ -220,6 +233,12 @@ extern "C" { #undef BIGNUM #endif +#define BN_FLG_MALLOCED 0x01 +#define BN_FLG_STATIC_DATA 0x02 +#define BN_FLG_FREE 0x8000 /* used for debuging */ +#define BN_set_flags(b,n) ((b)->flags|=(n)) +#define BN_get_flags(b,n) ((b)->flags&(n)) + typedef struct bignum_st { BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit chunks. */ @@ -227,6 +246,7 @@ typedef struct bignum_st /* The next are internal book keeping for bn_expand. */ int max; /* Size of the d array. 
*/ int neg; /* one if the number is negative */ + int flags; } BIGNUM; /* Used for temp variables */ @@ -234,7 +254,8 @@ typedef struct bignum_st typedef struct bignum_ctx { int tos; - BIGNUM *bn[BN_CTX_NUM+1]; + BIGNUM bn[BN_CTX_NUM+1]; + int flags; } BN_CTX; typedef struct bn_blinding_st @@ -248,51 +269,69 @@ typedef struct bn_blinding_st /* Used for montgomery multiplication */ typedef struct bn_mont_ctx_st { + int use_word; /* 0 for word form, 1 for long form */ int ri; /* number of bits in R */ - BIGNUM *RR; /* used to convert to montgomery form */ - BIGNUM *N; /* The modulus */ - BIGNUM *Ni; /* The inverse of N */ + BIGNUM RR; /* used to convert to montgomery form */ + BIGNUM N; /* The modulus */ + BIGNUM Ni; /* The inverse of N */ BN_ULONG n0; /* word form of inverse, normally only one of * Ni or n0 is defined */ + int flags; } BN_MONT_CTX; +/* Used for reciprocal division/mod functions + * It cannot be shared between threads + */ +typedef struct bn_recp_ctx_st + { + BIGNUM N; /* the divisor */ + BIGNUM Nr; /* the reciprocal */ + int num_bits; + int shift; + int flags; + } BN_RECP_CTX; + #define BN_to_montgomery(r,a,mont,ctx) BN_mod_mul_montgomery(\ - r,a,(mont)->RR,(mont),ctx) + r,a,&((mont)->RR),(mont),ctx) #define BN_prime_checks (5) #define BN_num_bytes(a) ((BN_num_bits(a)+7)/8) #define BN_is_word(a,w) (((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w))) -#define BN_is_zero(a) (((a)->top <= 1) && ((a)->d[0] == (BN_ULONG)0)) +#define BN_is_zero(a) (((a)->top == 0) || BN_is_word(a,0)) #define BN_is_one(a) (BN_is_word((a),1)) -#define BN_is_odd(a) ((a)->d[0] & 1) +#define BN_is_odd(a) (((a)->top > 0) && ((a)->d[0] & 1)) #define BN_one(a) (BN_set_word((a),1)) #define BN_zero(a) (BN_set_word((a),0)) -#define BN_ascii2bn(a) BN_hex2bn(a) -#define BN_bn2ascii(a) BN_bn2hex(a) - -#define bn_fix_top(a) \ - { \ - BN_ULONG *fix_top_l; \ - for (fix_top_l= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \ - if (*(fix_top_l--)) break; \ - } +/*#define BN_ascii2bn(a) 
BN_hex2bn(a) */ +/*#define BN_bn2ascii(a) BN_bn2hex(a) */ -#define bn_expand(n,b) ((((b)/BN_BITS2) <= (n)->max)?\ - (n):bn_expand2((n),(b)/BN_BITS2)) +#define bn_expand(n,b) ((((((b+BN_BITS2-1))/BN_BITS2)) <= (n)->max)?\ + (n):bn_expand2((n),(b)/BN_BITS2+1)) #define bn_wexpand(n,b) (((b) <= (n)->max)?(n):bn_expand2((n),(b))) +#define bn_fix_top(a) \ + { \ + BN_ULONG *ftl; \ + if ((a)->top > 0) \ + { \ + for (ftl= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \ + if (*(ftl--)) break; \ + } \ + } #ifndef NOPROTO BIGNUM *BN_value_one(void); char * BN_options(void); BN_CTX *BN_CTX_new(void); +void BN_CTX_init(BN_CTX *c); void BN_CTX_free(BN_CTX *c); int BN_rand(BIGNUM *rnd, int bits, int top,int bottom); int BN_num_bits(BIGNUM *a); int BN_num_bits_word(BN_ULONG); BIGNUM *BN_new(void); +void BN_init(BIGNUM *); void BN_clear_free(BIGNUM *a); BIGNUM *BN_copy(BIGNUM *a, BIGNUM *b); BIGNUM *BN_bin2bn(unsigned char *s,int len,BIGNUM *ret); @@ -300,20 +339,20 @@ int BN_bn2bin(BIGNUM *a, unsigned char *to); BIGNUM *BN_mpi2bn(unsigned char *s,int len,BIGNUM *ret); int BN_bn2mpi(BIGNUM *a, unsigned char *to); int BN_sub(BIGNUM *r, BIGNUM *a, BIGNUM *b); -void bn_qsub(BIGNUM *r, BIGNUM *a, BIGNUM *b); -void bn_qadd(BIGNUM *r, BIGNUM *a, BIGNUM *b); +int BN_usub(BIGNUM *r, BIGNUM *a, BIGNUM *b); +int BN_uadd(BIGNUM *r, BIGNUM *a, BIGNUM *b); int BN_add(BIGNUM *r, BIGNUM *a, BIGNUM *b); int BN_mod(BIGNUM *rem, BIGNUM *m, BIGNUM *d, BN_CTX *ctx); int BN_div(BIGNUM *dv, BIGNUM *rem, BIGNUM *m, BIGNUM *d, BN_CTX *ctx); -int BN_mul(BIGNUM *r, BIGNUM *a, BIGNUM *b); +int BN_mul(BIGNUM *r, BIGNUM *a, BIGNUM *b,BN_CTX *ctx); int BN_sqr(BIGNUM *r, BIGNUM *a,BN_CTX *ctx); -BN_ULONG BN_mod_word(BIGNUM *a, unsigned long w); -BN_ULONG BN_div_word(BIGNUM *a, unsigned long w); -int BN_mul_word(BIGNUM *a, unsigned long w); -int BN_add_word(BIGNUM *a, unsigned long w); -int BN_sub_word(BIGNUM *a, unsigned long w); -int BN_set_word(BIGNUM *a, unsigned long w); -unsigned long BN_get_word(BIGNUM 
*a); +BN_ULONG BN_mod_word(BIGNUM *a, BN_ULONG w); +BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w); +int BN_mul_word(BIGNUM *a, BN_ULONG w); +int BN_add_word(BIGNUM *a, BN_ULONG w); +int BN_sub_word(BIGNUM *a, BN_ULONG w); +int BN_set_word(BIGNUM *a, BN_ULONG w); +BN_ULONG BN_get_word(BIGNUM *a); int BN_cmp(BIGNUM *a, BIGNUM *b); void BN_free(BIGNUM *a); int BN_is_bit_set(BIGNUM *a, int n); @@ -323,12 +362,11 @@ int BN_exp(BIGNUM *r, BIGNUM *a, BIGNUM *p,BN_CTX *ctx); int BN_mod_exp(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx); int BN_mod_exp_mont(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx, BN_MONT_CTX *m_ctx); -int BN_mod_exp_recp(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx); +int BN_mod_exp2_mont(BIGNUM *r, BIGNUM *a1, BIGNUM *p1,BIGNUM *a2, + BIGNUM *p2,BIGNUM *m,BN_CTX *ctx,BN_MONT_CTX *m_ctx); int BN_mod_exp_simple(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx); int BN_mask_bits(BIGNUM *a,int n); -int BN_mod_mul_reciprocal(BIGNUM *r, BIGNUM *x, BIGNUM *y, BIGNUM *m, - BIGNUM *i, int nb, BN_CTX *ctx); int BN_mod_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *m, BN_CTX *ctx); #ifndef WIN16 @@ -339,7 +377,7 @@ int BN_print(BIO *fp, BIGNUM *a); #else int BN_print(char *fp, BIGNUM *a); #endif -int BN_reciprocal(BIGNUM *r, BIGNUM *m, BN_CTX *ctx); +int BN_reciprocal(BIGNUM *r, BIGNUM *m, int len, BN_CTX *ctx); int BN_rshift(BIGNUM *r, BIGNUM *a, int n); int BN_rshift1(BIGNUM *r, BIGNUM *a); void BN_clear(BIGNUM *a); @@ -353,8 +391,8 @@ char * BN_bn2dec(BIGNUM *a); int BN_hex2bn(BIGNUM **a,char *str); int BN_dec2bn(BIGNUM **a,char *str); int BN_gcd(BIGNUM *r,BIGNUM *in_a,BIGNUM *in_b,BN_CTX *ctx); -BIGNUM *BN_mod_inverse(BIGNUM *a, BIGNUM *n,BN_CTX *ctx); -BIGNUM *BN_generate_prime(int bits,int strong,BIGNUM *add, +BIGNUM *BN_mod_inverse(BIGNUM *ret,BIGNUM *a, BIGNUM *n,BN_CTX *ctx); +BIGNUM *BN_generate_prime(BIGNUM *ret,int bits,int strong,BIGNUM *add, BIGNUM *rem,void (*callback)(int,int,char *),char *cb_arg); int 
BN_is_prime(BIGNUM *p,int nchecks,void (*callback)(int,int,char *), BN_CTX *ctx,char *cb_arg); @@ -363,15 +401,18 @@ void ERR_load_BN_strings(void ); BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w); BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w); void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num); -BN_ULONG bn_div64(BN_ULONG h, BN_ULONG l, BN_ULONG d); +BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d); BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num); +BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num); BN_MONT_CTX *BN_MONT_CTX_new(void ); +void BN_MONT_CTX_init(BN_MONT_CTX *ctx); int BN_mod_mul_montgomery(BIGNUM *r,BIGNUM *a,BIGNUM *b,BN_MONT_CTX *mont, BN_CTX *ctx); int BN_from_montgomery(BIGNUM *r,BIGNUM *a,BN_MONT_CTX *mont,BN_CTX *ctx); void BN_MONT_CTX_free(BN_MONT_CTX *mont); int BN_MONT_CTX_set(BN_MONT_CTX *mont,BIGNUM *modulus,BN_CTX *ctx); +BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to,BN_MONT_CTX *from); BN_BLINDING *BN_BLINDING_new(BIGNUM *A,BIGNUM *Ai,BIGNUM *mod); void BN_BLINDING_free(BN_BLINDING *b); @@ -379,16 +420,45 @@ int BN_BLINDING_update(BN_BLINDING *b,BN_CTX *ctx); int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *r, BN_CTX *ctx); int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx); +void BN_set_params(int mul,int high,int low,int mont); +int BN_get_params(int which); /* 0, mul, 1 high, 2 low, 3 mont */ + +void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb); +void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b); +void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b); +void bn_sqr_normal(BN_ULONG *r, BN_ULONG *a, int n, BN_ULONG *tmp); +void bn_sqr_comba8(BN_ULONG *r,BN_ULONG *a); +void bn_sqr_comba4(BN_ULONG *r,BN_ULONG *a); +int bn_cmp_words(BN_ULONG *a,BN_ULONG *b,int n); +void bn_mul_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2,BN_ULONG *t); +void bn_mul_part_recursive(BN_ULONG *r,BN_ULONG 
*a,BN_ULONG *b, + int tn, int n,BN_ULONG *t); +void bn_sqr_recursive(BN_ULONG *r,BN_ULONG *a, int n2, BN_ULONG *t); +void bn_mul_low_normal(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b, int n); + +void BN_RECP_CTX_init(BN_RECP_CTX *recp); +BN_RECP_CTX *BN_RECP_CTX_new(void); +void BN_RECP_CTX_free(BN_RECP_CTX *recp); +int BN_RECP_CTX_set(BN_RECP_CTX *recp,BIGNUM *rdiv,BN_CTX *ctx); +int BN_mod_mul_reciprocal(BIGNUM *r, BIGNUM *x, BIGNUM *y, + BN_RECP_CTX *recp,BN_CTX *ctx); +int BN_mod_exp_recp(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx); +int BN_div_recp(BIGNUM *dv, BIGNUM *rem, BIGNUM *m, + BN_RECP_CTX *recp, BN_CTX *ctx); + + #else BIGNUM *BN_value_one(); char * BN_options(); BN_CTX *BN_CTX_new(); +void BN_CTX_init(); void BN_CTX_free(); int BN_rand(); int BN_num_bits(); int BN_num_bits_word(); BIGNUM *BN_new(); +void BN_init(); void BN_clear_free(); BIGNUM *BN_copy(); BIGNUM *BN_bin2bn(); @@ -396,8 +466,8 @@ int BN_bn2bin(); BIGNUM *BN_mpi2bn(); int BN_bn2mpi(); int BN_sub(); -void bn_qsub(); -void bn_qadd(); +int BN_usub(); +int BN_uadd(); int BN_add(); int BN_mod(); int BN_div(); @@ -449,12 +519,14 @@ void ERR_load_BN_strings(); BN_ULONG bn_mul_add_words(); BN_ULONG bn_mul_words(); void bn_sqr_words(); -BN_ULONG bn_div64(); +BN_ULONG bn_div_words(); BN_ULONG bn_add_words(); +BN_ULONG bn_sub_words(); int BN_mod_mul_montgomery(); int BN_from_montgomery(); BN_MONT_CTX *BN_MONT_CTX_new(); +void BN_MONT_CTX_init(); void BN_MONT_CTX_free(); int BN_MONT_CTX_set(); @@ -464,6 +536,26 @@ int BN_BLINDING_update(); int BN_BLINDING_convert(); int BN_BLINDING_invert(); +void bn_mul_normal(); +void bn_mul_comba8(); +void bn_mul_comba4(); +void bn_sqr_normal(); +void bn_sqr_comba8(); +void bn_sqr_comba4(); +int bn_cmp_words(); +void bn_mul_recursive(); +void bn_mul_part_recursive(); +void bn_sqr_recursive(); +void bn_mul_low_normal(); + +void BN_RECP_CTX_init(); +BN_RECP_CTX *BN_RECP_CTX_new(); +void BN_RECP_CTX_free(); +int BN_RECP_CTX_set(); +int 
BN_mod_mul_reciprocal(); +int BN_mod_exp_recp(); +int BN_div_recp(); + #endif /* BEGIN ERROR CODES */ @@ -485,15 +577,18 @@ int BN_BLINDING_invert(); #define BN_F_BN_MPI2BN 112 #define BN_F_BN_NEW 113 #define BN_F_BN_RAND 114 +#define BN_F_BN_USUB 115 /* Reason codes. */ -#define BN_R_BAD_RECIPROCAL 100 -#define BN_R_CALLED_WITH_EVEN_MODULUS 101 -#define BN_R_DIV_BY_ZERO 102 -#define BN_R_ENCODING_ERROR 103 -#define BN_R_INVALID_LENGTH 104 -#define BN_R_NOT_INITALISED 105 -#define BN_R_NO_INVERSE 106 +#define BN_R_ARG2_LT_ARG3 100 +#define BN_R_BAD_RECIPROCAL 101 +#define BN_R_CALLED_WITH_EVEN_MODULUS 102 +#define BN_R_DIV_BY_ZERO 103 +#define BN_R_ENCODING_ERROR 104 +#define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA 105 +#define BN_R_INVALID_LENGTH 106 +#define BN_R_NOT_INITALISED 107 +#define BN_R_NO_INVERSE 108 #ifdef __cplusplus } diff --git a/crypto/bn/bn.mul b/crypto/bn/bn.mul new file mode 100644 index 0000000000..9728870d38 --- /dev/null +++ b/crypto/bn/bn.mul @@ -0,0 +1,19 @@ +We need + +* bn_mul_comba8 +* bn_mul_comba4 +* bn_mul_normal +* bn_mul_recursive + +* bn_sqr_comba8 +* bn_sqr_comba4 +bn_sqr_normal -> BN_sqr +* bn_sqr_recursive + +* bn_mul_low_recursive +* bn_mul_low_normal +* bn_mul_high + +* bn_mul_part_recursive # symetric but not power of 2 + +bn_mul_asymetric_recursive # uneven, but do the chop up. 
diff --git a/crypto/bn/bn.org b/crypto/bn/bn.org index 66dde285d6..d8904d7efa 100644 --- a/crypto/bn/bn.org +++ b/crypto/bn/bn.org @@ -77,6 +77,9 @@ extern "C" { #define BN_LLONG /* This comment stops Configure mutilating things */ #endif +#define BN_MUL_COMBA +#define BN_SQR_COMBA +#define BN_RECURSION #define RECP_MUL_MOD #define MONT_MUL_MOD @@ -105,6 +108,7 @@ extern "C" { #undef SIXTEEN_BIT #undef EIGHT_BIT + /* assuming long is 64bit - this is the DEC Alpha * unsigned long long is only 64 bits :-(, don't define * BN_LLONG for the DEC Alpha */ @@ -116,17 +120,23 @@ extern "C" { #define BN_BYTES 8 #define BN_BITS2 64 #define BN_BITS4 32 +#define BN_MASK (0xffffffffffffffffffffffffffffffffLL) #define BN_MASK2 (0xffffffffffffffffL) #define BN_MASK2l (0xffffffffL) #define BN_MASK2h (0xffffffff00000000L) #define BN_MASK2h1 (0xffffffff80000000L) #define BN_TBIT (0x8000000000000000L) -#define BN_DEC_CONV (10000000000000000000L) +#define BN_DEC_CONV (10000000000000000000UL) #define BN_DEC_FMT1 "%lu" #define BN_DEC_FMT2 "%019lu" #define BN_DEC_NUM 19 #endif +/* This is where the long long data type is 64 bits, but long is 32. + * For machines where there are 64bit registers, this is the mode to use. + * IRIX, on R4000 and above should use this mode, along with the relevent + * assember code :-). Do NOT define BN_ULLONG. 
+ */ #ifdef SIXTY_FOUR_BIT #undef BN_LLONG /* #define BN_ULLONG unsigned long long */ @@ -141,9 +151,9 @@ extern "C" { #define BN_MASK2h (0xffffffff00000000LL) #define BN_MASK2h1 (0xffffffff80000000LL) #define BN_TBIT (0x8000000000000000LL) -#define BN_DEC_CONV (10000000000000000000L) -#define BN_DEC_FMT1 "%lu" -#define BN_DEC_FMT2 "%019lu" +#define BN_DEC_CONV (10000000000000000000LL) +#define BN_DEC_FMT1 "%llu" +#define BN_DEC_FMT2 "%019llu" #define BN_DEC_NUM 19 #endif @@ -159,6 +169,7 @@ extern "C" { #define BN_BYTES 4 #define BN_BITS2 32 #define BN_BITS4 16 +#define BN_MASK (0xffffffffffffffffLL) #define BN_MASK2 (0xffffffffL) #define BN_MASK2l (0xffff) #define BN_MASK2h1 (0xffff8000L) @@ -181,6 +192,7 @@ extern "C" { #define BN_BYTES 2 #define BN_BITS2 16 #define BN_BITS4 8 +#define BN_MASK (0xffffffff) #define BN_MASK2 (0xffff) #define BN_MASK2l (0xff) #define BN_MASK2h1 (0xff80) @@ -203,6 +215,7 @@ extern "C" { #define BN_BYTES 1 #define BN_BITS2 8 #define BN_BITS4 4 +#define BN_MASK (0xffff) #define BN_MASK2 (0xff) #define BN_MASK2l (0xf) #define BN_MASK2h1 (0xf8) @@ -220,6 +233,12 @@ extern "C" { #undef BIGNUM #endif +#define BN_FLG_MALLOCED 0x01 +#define BN_FLG_STATIC_DATA 0x02 +#define BN_FLG_FREE 0x8000 /* used for debuging */ +#define BN_set_flags(b,n) ((b)->flags|=(n)) +#define BN_get_flags(b,n) ((b)->flags&(n)) + typedef struct bignum_st { BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit chunks. */ @@ -227,6 +246,7 @@ typedef struct bignum_st /* The next are internal book keeping for bn_expand. */ int max; /* Size of the d array. 
*/ int neg; /* one if the number is negative */ + int flags; } BIGNUM; /* Used for temp variables */ @@ -234,7 +254,8 @@ typedef struct bignum_st typedef struct bignum_ctx { int tos; - BIGNUM *bn[BN_CTX_NUM+1]; + BIGNUM bn[BN_CTX_NUM+1]; + int flags; } BN_CTX; typedef struct bn_blinding_st @@ -248,51 +269,69 @@ typedef struct bn_blinding_st /* Used for montgomery multiplication */ typedef struct bn_mont_ctx_st { + int use_word; /* 0 for word form, 1 for long form */ int ri; /* number of bits in R */ - BIGNUM *RR; /* used to convert to montgomery form */ - BIGNUM *N; /* The modulus */ - BIGNUM *Ni; /* The inverse of N */ + BIGNUM RR; /* used to convert to montgomery form */ + BIGNUM N; /* The modulus */ + BIGNUM Ni; /* The inverse of N */ BN_ULONG n0; /* word form of inverse, normally only one of * Ni or n0 is defined */ + int flags; } BN_MONT_CTX; +/* Used for reciprocal division/mod functions + * It cannot be shared between threads + */ +typedef struct bn_recp_ctx_st + { + BIGNUM N; /* the divisor */ + BIGNUM Nr; /* the reciprocal */ + int num_bits; + int shift; + int flags; + } BN_RECP_CTX; + #define BN_to_montgomery(r,a,mont,ctx) BN_mod_mul_montgomery(\ - r,a,(mont)->RR,(mont),ctx) + r,a,&((mont)->RR),(mont),ctx) #define BN_prime_checks (5) #define BN_num_bytes(a) ((BN_num_bits(a)+7)/8) #define BN_is_word(a,w) (((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w))) -#define BN_is_zero(a) (((a)->top <= 1) && ((a)->d[0] == (BN_ULONG)0)) +#define BN_is_zero(a) (((a)->top == 0) || BN_is_word(a,0)) #define BN_is_one(a) (BN_is_word((a),1)) -#define BN_is_odd(a) ((a)->d[0] & 1) +#define BN_is_odd(a) (((a)->top > 0) && ((a)->d[0] & 1)) #define BN_one(a) (BN_set_word((a),1)) #define BN_zero(a) (BN_set_word((a),0)) -#define BN_ascii2bn(a) BN_hex2bn(a) -#define BN_bn2ascii(a) BN_bn2hex(a) - -#define bn_fix_top(a) \ - { \ - BN_ULONG *fix_top_l; \ - for (fix_top_l= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \ - if (*(fix_top_l--)) break; \ - } +/*#define BN_ascii2bn(a) 
BN_hex2bn(a) */ +/*#define BN_bn2ascii(a) BN_bn2hex(a) */ -#define bn_expand(n,b) ((((b)/BN_BITS2) <= (n)->max)?\ - (n):bn_expand2((n),(b)/BN_BITS2)) +#define bn_expand(n,b) ((((((b+BN_BITS2-1))/BN_BITS2)) <= (n)->max)?\ + (n):bn_expand2((n),(b)/BN_BITS2+1)) #define bn_wexpand(n,b) (((b) <= (n)->max)?(n):bn_expand2((n),(b))) +#define bn_fix_top(a) \ + { \ + BN_ULONG *ftl; \ + if ((a)->top > 0) \ + { \ + for (ftl= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \ + if (*(ftl--)) break; \ + } \ + } #ifndef NOPROTO BIGNUM *BN_value_one(void); char * BN_options(void); BN_CTX *BN_CTX_new(void); +void BN_CTX_init(BN_CTX *c); void BN_CTX_free(BN_CTX *c); int BN_rand(BIGNUM *rnd, int bits, int top,int bottom); int BN_num_bits(BIGNUM *a); int BN_num_bits_word(BN_ULONG); BIGNUM *BN_new(void); +void BN_init(BIGNUM *); void BN_clear_free(BIGNUM *a); BIGNUM *BN_copy(BIGNUM *a, BIGNUM *b); BIGNUM *BN_bin2bn(unsigned char *s,int len,BIGNUM *ret); @@ -300,20 +339,20 @@ int BN_bn2bin(BIGNUM *a, unsigned char *to); BIGNUM *BN_mpi2bn(unsigned char *s,int len,BIGNUM *ret); int BN_bn2mpi(BIGNUM *a, unsigned char *to); int BN_sub(BIGNUM *r, BIGNUM *a, BIGNUM *b); -void bn_qsub(BIGNUM *r, BIGNUM *a, BIGNUM *b); -void bn_qadd(BIGNUM *r, BIGNUM *a, BIGNUM *b); +int BN_usub(BIGNUM *r, BIGNUM *a, BIGNUM *b); +int BN_uadd(BIGNUM *r, BIGNUM *a, BIGNUM *b); int BN_add(BIGNUM *r, BIGNUM *a, BIGNUM *b); int BN_mod(BIGNUM *rem, BIGNUM *m, BIGNUM *d, BN_CTX *ctx); int BN_div(BIGNUM *dv, BIGNUM *rem, BIGNUM *m, BIGNUM *d, BN_CTX *ctx); -int BN_mul(BIGNUM *r, BIGNUM *a, BIGNUM *b); +int BN_mul(BIGNUM *r, BIGNUM *a, BIGNUM *b,BN_CTX *ctx); int BN_sqr(BIGNUM *r, BIGNUM *a,BN_CTX *ctx); -BN_ULONG BN_mod_word(BIGNUM *a, unsigned long w); -BN_ULONG BN_div_word(BIGNUM *a, unsigned long w); -int BN_mul_word(BIGNUM *a, unsigned long w); -int BN_add_word(BIGNUM *a, unsigned long w); -int BN_sub_word(BIGNUM *a, unsigned long w); -int BN_set_word(BIGNUM *a, unsigned long w); -unsigned long BN_get_word(BIGNUM 
*a); +BN_ULONG BN_mod_word(BIGNUM *a, BN_ULONG w); +BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w); +int BN_mul_word(BIGNUM *a, BN_ULONG w); +int BN_add_word(BIGNUM *a, BN_ULONG w); +int BN_sub_word(BIGNUM *a, BN_ULONG w); +int BN_set_word(BIGNUM *a, BN_ULONG w); +BN_ULONG BN_get_word(BIGNUM *a); int BN_cmp(BIGNUM *a, BIGNUM *b); void BN_free(BIGNUM *a); int BN_is_bit_set(BIGNUM *a, int n); @@ -323,12 +362,11 @@ int BN_exp(BIGNUM *r, BIGNUM *a, BIGNUM *p,BN_CTX *ctx); int BN_mod_exp(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx); int BN_mod_exp_mont(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx, BN_MONT_CTX *m_ctx); -int BN_mod_exp_recp(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx); +int BN_mod_exp2_mont(BIGNUM *r, BIGNUM *a1, BIGNUM *p1,BIGNUM *a2, + BIGNUM *p2,BIGNUM *m,BN_CTX *ctx,BN_MONT_CTX *m_ctx); int BN_mod_exp_simple(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx); int BN_mask_bits(BIGNUM *a,int n); -int BN_mod_mul_reciprocal(BIGNUM *r, BIGNUM *x, BIGNUM *y, BIGNUM *m, - BIGNUM *i, int nb, BN_CTX *ctx); int BN_mod_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *m, BN_CTX *ctx); #ifndef WIN16 @@ -339,7 +377,7 @@ int BN_print(BIO *fp, BIGNUM *a); #else int BN_print(char *fp, BIGNUM *a); #endif -int BN_reciprocal(BIGNUM *r, BIGNUM *m, BN_CTX *ctx); +int BN_reciprocal(BIGNUM *r, BIGNUM *m, int len, BN_CTX *ctx); int BN_rshift(BIGNUM *r, BIGNUM *a, int n); int BN_rshift1(BIGNUM *r, BIGNUM *a); void BN_clear(BIGNUM *a); @@ -353,8 +391,8 @@ char * BN_bn2dec(BIGNUM *a); int BN_hex2bn(BIGNUM **a,char *str); int BN_dec2bn(BIGNUM **a,char *str); int BN_gcd(BIGNUM *r,BIGNUM *in_a,BIGNUM *in_b,BN_CTX *ctx); -BIGNUM *BN_mod_inverse(BIGNUM *a, BIGNUM *n,BN_CTX *ctx); -BIGNUM *BN_generate_prime(int bits,int strong,BIGNUM *add, +BIGNUM *BN_mod_inverse(BIGNUM *ret,BIGNUM *a, BIGNUM *n,BN_CTX *ctx); +BIGNUM *BN_generate_prime(BIGNUM *ret,int bits,int strong,BIGNUM *add, BIGNUM *rem,void (*callback)(int,int,char *),char *cb_arg); int 
BN_is_prime(BIGNUM *p,int nchecks,void (*callback)(int,int,char *), BN_CTX *ctx,char *cb_arg); @@ -363,15 +401,18 @@ void ERR_load_BN_strings(void ); BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w); BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w); void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num); -BN_ULONG bn_div64(BN_ULONG h, BN_ULONG l, BN_ULONG d); +BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d); BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num); +BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num); BN_MONT_CTX *BN_MONT_CTX_new(void ); +void BN_MONT_CTX_init(BN_MONT_CTX *ctx); int BN_mod_mul_montgomery(BIGNUM *r,BIGNUM *a,BIGNUM *b,BN_MONT_CTX *mont, BN_CTX *ctx); int BN_from_montgomery(BIGNUM *r,BIGNUM *a,BN_MONT_CTX *mont,BN_CTX *ctx); void BN_MONT_CTX_free(BN_MONT_CTX *mont); int BN_MONT_CTX_set(BN_MONT_CTX *mont,BIGNUM *modulus,BN_CTX *ctx); +BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to,BN_MONT_CTX *from); BN_BLINDING *BN_BLINDING_new(BIGNUM *A,BIGNUM *Ai,BIGNUM *mod); void BN_BLINDING_free(BN_BLINDING *b); @@ -379,16 +420,45 @@ int BN_BLINDING_update(BN_BLINDING *b,BN_CTX *ctx); int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *r, BN_CTX *ctx); int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx); +void BN_set_params(int mul,int high,int low,int mont); +int BN_get_params(int which); /* 0, mul, 1 high, 2 low, 3 mont */ + +void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb); +void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b); +void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b); +void bn_sqr_normal(BN_ULONG *r, BN_ULONG *a, int n, BN_ULONG *tmp); +void bn_sqr_comba8(BN_ULONG *r,BN_ULONG *a); +void bn_sqr_comba4(BN_ULONG *r,BN_ULONG *a); +int bn_cmp_words(BN_ULONG *a,BN_ULONG *b,int n); +void bn_mul_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2,BN_ULONG *t); +void bn_mul_part_recursive(BN_ULONG *r,BN_ULONG 
*a,BN_ULONG *b, + int tn, int n,BN_ULONG *t); +void bn_sqr_recursive(BN_ULONG *r,BN_ULONG *a, int n2, BN_ULONG *t); +void bn_mul_low_normal(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b, int n); + +void BN_RECP_CTX_init(BN_RECP_CTX *recp); +BN_RECP_CTX *BN_RECP_CTX_new(void); +void BN_RECP_CTX_free(BN_RECP_CTX *recp); +int BN_RECP_CTX_set(BN_RECP_CTX *recp,BIGNUM *rdiv,BN_CTX *ctx); +int BN_mod_mul_reciprocal(BIGNUM *r, BIGNUM *x, BIGNUM *y, + BN_RECP_CTX *recp,BN_CTX *ctx); +int BN_mod_exp_recp(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,BN_CTX *ctx); +int BN_div_recp(BIGNUM *dv, BIGNUM *rem, BIGNUM *m, + BN_RECP_CTX *recp, BN_CTX *ctx); + + #else BIGNUM *BN_value_one(); char * BN_options(); BN_CTX *BN_CTX_new(); +void BN_CTX_init(); void BN_CTX_free(); int BN_rand(); int BN_num_bits(); int BN_num_bits_word(); BIGNUM *BN_new(); +void BN_init(); void BN_clear_free(); BIGNUM *BN_copy(); BIGNUM *BN_bin2bn(); @@ -396,8 +466,8 @@ int BN_bn2bin(); BIGNUM *BN_mpi2bn(); int BN_bn2mpi(); int BN_sub(); -void bn_qsub(); -void bn_qadd(); +int BN_usub(); +int BN_uadd(); int BN_add(); int BN_mod(); int BN_div(); @@ -449,12 +519,14 @@ void ERR_load_BN_strings(); BN_ULONG bn_mul_add_words(); BN_ULONG bn_mul_words(); void bn_sqr_words(); -BN_ULONG bn_div64(); +BN_ULONG bn_div_words(); BN_ULONG bn_add_words(); +BN_ULONG bn_sub_words(); int BN_mod_mul_montgomery(); int BN_from_montgomery(); BN_MONT_CTX *BN_MONT_CTX_new(); +void BN_MONT_CTX_init(); void BN_MONT_CTX_free(); int BN_MONT_CTX_set(); @@ -464,6 +536,26 @@ int BN_BLINDING_update(); int BN_BLINDING_convert(); int BN_BLINDING_invert(); +void bn_mul_normal(); +void bn_mul_comba8(); +void bn_mul_comba4(); +void bn_sqr_normal(); +void bn_sqr_comba8(); +void bn_sqr_comba4(); +int bn_cmp_words(); +void bn_mul_recursive(); +void bn_mul_part_recursive(); +void bn_sqr_recursive(); +void bn_mul_low_normal(); + +void BN_RECP_CTX_init(); +BN_RECP_CTX *BN_RECP_CTX_new(); +void BN_RECP_CTX_free(); +int BN_RECP_CTX_set(); +int 
BN_mod_mul_reciprocal(); +int BN_mod_exp_recp(); +int BN_div_recp(); + #endif /* BEGIN ERROR CODES */ @@ -485,15 +577,18 @@ int BN_BLINDING_invert(); #define BN_F_BN_MPI2BN 112 #define BN_F_BN_NEW 113 #define BN_F_BN_RAND 114 +#define BN_F_BN_USUB 115 /* Reason codes. */ -#define BN_R_BAD_RECIPROCAL 100 -#define BN_R_CALLED_WITH_EVEN_MODULUS 101 -#define BN_R_DIV_BY_ZERO 102 -#define BN_R_ENCODING_ERROR 103 -#define BN_R_INVALID_LENGTH 104 -#define BN_R_NOT_INITALISED 105 -#define BN_R_NO_INVERSE 106 +#define BN_R_ARG2_LT_ARG3 100 +#define BN_R_BAD_RECIPROCAL 101 +#define BN_R_CALLED_WITH_EVEN_MODULUS 102 +#define BN_R_DIV_BY_ZERO 103 +#define BN_R_ENCODING_ERROR 104 +#define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA 105 +#define BN_R_INVALID_LENGTH 106 +#define BN_R_NOT_INITALISED 107 +#define BN_R_NO_INVERSE 108 #ifdef __cplusplus } diff --git a/crypto/bn/bn_add.c b/crypto/bn/bn_add.c index efb2e312e8..27b781a367 100644 --- a/crypto/bn/bn_add.c +++ b/crypto/bn/bn_add.c @@ -66,9 +66,11 @@ BIGNUM *r; BIGNUM *a; BIGNUM *b; { - int i; BIGNUM *tmp; + bn_check_top(a); + bn_check_top(b); + /* a + b a+b * a + -b a-b * -a + b b-a @@ -84,14 +86,12 @@ BIGNUM *b; if (BN_ucmp(a,b) < 0) { - if (bn_wexpand(r,b->top) == NULL) return(0); - bn_qsub(r,b,a); + if (!BN_usub(r,b,a)) return(0); r->neg=1; } else { - if (bn_wexpand(r,a->top) == NULL) return(0); - bn_qsub(r,a,b); + if (!BN_usub(r,a,b)) return(0); r->neg=0; } return(1); @@ -102,23 +102,12 @@ BIGNUM *b; else r->neg=0; - i=(a->top > b->top); - - if (i) - { - if (bn_wexpand(r,a->top+1) == NULL) return(0); - bn_qadd(r,a,b); - } - else - { - if (bn_wexpand(r,b->top+1) == NULL) return(0); - bn_qadd(r,b,a); - } + if (!BN_uadd(r,a,b)) return(0); return(1); } /* unsigned add of b to a, r must be large enough */ -void bn_qadd(r,a,b) +int BN_uadd(r,a,b) BIGNUM *r; BIGNUM *a; BIGNUM *b; @@ -126,11 +115,22 @@ BIGNUM *b; register int i; int max,min; BN_ULONG *ap,*bp,*rp,carry,t1; + BIGNUM *tmp; + + bn_check_top(a); + bn_check_top(b); + if 
(a->top < b->top) + { tmp=a; a=b; b=tmp; } max=a->top; min=b->top; + + if (bn_wexpand(r,max+1) == NULL) + return(0); + r->top=max; + ap=a->d; bp=b->d; rp=r->d; @@ -160,8 +160,160 @@ BIGNUM *b; r->top++; } } - for (; i<max; i++) - *(rp++)= *(ap++); + if (rp != ap) + { + for (; i<max; i++) + *(rp++)= *(ap++); + } /* memcpy(rp,ap,sizeof(*ap)*(max-i));*/ + return(1); + } + +/* unsigned subtraction of b from a, a must be larger than b. */ +int BN_usub(r, a, b) +BIGNUM *r; +BIGNUM *a; +BIGNUM *b; + { + int max,min,ret=1; + register BN_ULONG t1,t2,*ap,*bp,*rp; + int i,carry; +#if defined(IRIX_CC_BUG) && !defined(LINT) + int dummy; +#endif + + bn_check_top(a); + bn_check_top(b); + + if (a->top < b->top) /* hmm... should not be happening */ + { + BNerr(BN_F_BN_USUB,BN_R_ARG2_LT_ARG3); + return(0); + } + + max=a->top; + min=b->top; + if (bn_wexpand(r,max) == NULL) return(0); + + ap=a->d; + bp=b->d; + rp=r->d; + +#if 1 + carry=0; + for (i=0; i<min; i++) + { + t1= *(ap++); + t2= *(bp++); + if (carry) + { + carry=(t1 <= t2); + t1=(t1-t2-1)&BN_MASK2; + } + else + { + carry=(t1 < t2); + t1=(t1-t2)&BN_MASK2; + } +#if defined(IRIX_CC_BUG) && !defined(LINT) + dummy=t1; +#endif + *(rp++)=t1&BN_MASK2; + } +#else + carry=bn_sub_words(rp,ap,bp,min); + ap+=min; + bp+=min; + rp+=min; + i=min; +#endif + if (carry) /* subtracted */ + { + while (i < max) + { + i++; + t1= *(ap++); + t2=(t1-1)&BN_MASK2; + *(rp++)=t2; + if (t1 > t2) break; + } + } +#if 0 + memcpy(rp,ap,sizeof(*rp)*(max-i)); +#else + if (rp != ap) + { + for (;;) + { + if (i++ >= max) break; + rp[0]=ap[0]; + if (i++ >= max) break; + rp[1]=ap[1]; + if (i++ >= max) break; + rp[2]=ap[2]; + if (i++ >= max) break; + rp[3]=ap[3]; + rp+=4; + ap+=4; + } + } +#endif + + r->top=max; + bn_fix_top(r); + return(1); + } + +int BN_sub(r, a, b) +BIGNUM *r; +BIGNUM *a; +BIGNUM *b; + { + int max; + int add=0,neg=0; + BIGNUM *tmp; + + bn_check_top(a); + bn_check_top(b); + + /* a - b a-b + * a - -b a+b + * -a - b -(a+b) + * -a - -b b-a + */ + if 
(a->neg) + { + if (b->neg) + { tmp=a; a=b; b=tmp; } + else + { add=1; neg=1; } + } + else + { + if (b->neg) { add=1; neg=0; } + } + + if (add) + { + if (!BN_uadd(r,a,b)) return(0); + r->neg=neg; + return(1); + } + + /* We are actually doing a - b :-) */ + + max=(a->top > b->top)?a->top:b->top; + if (bn_wexpand(r,max) == NULL) return(0); + if (BN_ucmp(a,b) < 0) + { + if (!BN_usub(r,b,a)) return(0); + r->neg=1; + } + else + { + if (!BN_usub(r,a,b)) return(0); + r->neg=0; + } + return(1); } diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c new file mode 100644 index 0000000000..c9eb0e9d05 --- /dev/null +++ b/crypto/bn/bn_asm.c @@ -0,0 +1,829 @@ +/* crypto/bn/bn_asm.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
+ */ + +#include <stdio.h> +#include "cryptlib.h" +#include "bn_lcl.h" + +#ifdef BN_LLONG + +BN_ULONG bn_mul_add_words(rp,ap,num,w) +BN_ULONG *rp,*ap; +int num; +BN_ULONG w; + { + BN_ULONG c1=0; + + bn_check_num(num); + if (num <= 0) return(c1); + + for (;;) + { + mul_add(rp[0],ap[0],w,c1); + if (--num == 0) break; + mul_add(rp[1],ap[1],w,c1); + if (--num == 0) break; + mul_add(rp[2],ap[2],w,c1); + if (--num == 0) break; + mul_add(rp[3],ap[3],w,c1); + if (--num == 0) break; + ap+=4; + rp+=4; + } + + return(c1); + } + +BN_ULONG bn_mul_words(rp,ap,num,w) +BN_ULONG *rp,*ap; +int num; +BN_ULONG w; + { + BN_ULONG c1=0; + + bn_check_num(num); + if (num <= 0) return(c1); + + for (;;) + { + mul(rp[0],ap[0],w,c1); + if (--num == 0) break; + mul(rp[1],ap[1],w,c1); + if (--num == 0) break; + mul(rp[2],ap[2],w,c1); + if (--num == 0) break; + mul(rp[3],ap[3],w,c1); + if (--num == 0) break; + ap+=4; + rp+=4; + } + return(c1); + } + +void bn_sqr_words(r,a,n) +BN_ULONG *r,*a; +int n; + { + bn_check_num(n); + if (n <= 0) return; + for (;;) + { + BN_ULLONG t; + + t=(BN_ULLONG)(a[0])*(a[0]); + r[0]=Lw(t); r[1]=Hw(t); + if (--n == 0) break; + + t=(BN_ULLONG)(a[1])*(a[1]); + r[2]=Lw(t); r[3]=Hw(t); + if (--n == 0) break; + + t=(BN_ULLONG)(a[2])*(a[2]); + r[4]=Lw(t); r[5]=Hw(t); + if (--n == 0) break; + + t=(BN_ULLONG)(a[3])*(a[3]); + r[6]=Lw(t); r[7]=Hw(t); + if (--n == 0) break; + + a+=4; + r+=8; + } + } + +#else + +BN_ULONG bn_mul_add_words(rp,ap,num,w) +BN_ULONG *rp,*ap; +int num; +BN_ULONG w; + { + BN_ULONG c=0; + BN_ULONG bl,bh; + + bn_check_num(num); + if (num <= 0) return((BN_ULONG)0); + + bl=LBITS(w); + bh=HBITS(w); + + for (;;) + { + mul_add(rp[0],ap[0],bl,bh,c); + if (--num == 0) break; + mul_add(rp[1],ap[1],bl,bh,c); + if (--num == 0) break; + mul_add(rp[2],ap[2],bl,bh,c); + if (--num == 0) break; + mul_add(rp[3],ap[3],bl,bh,c); + if (--num == 0) break; + ap+=4; + rp+=4; + } + return(c); + } + +BN_ULONG bn_mul_words(rp,ap,num,w) +BN_ULONG *rp,*ap; +int num; +BN_ULONG w; + { 
+ BN_ULONG carry=0; + BN_ULONG bl,bh; + + bn_check_num(num); + if (num <= 0) return((BN_ULONG)0); + + bl=LBITS(w); + bh=HBITS(w); + + for (;;) + { + mul(rp[0],ap[0],bl,bh,carry); + if (--num == 0) break; + mul(rp[1],ap[1],bl,bh,carry); + if (--num == 0) break; + mul(rp[2],ap[2],bl,bh,carry); + if (--num == 0) break; + mul(rp[3],ap[3],bl,bh,carry); + if (--num == 0) break; + ap+=4; + rp+=4; + } + return(carry); + } + +void bn_sqr_words(r,a,n) +BN_ULONG *r,*a; +int n; + { + bn_check_num(n); + if (n <= 0) return; + for (;;) + { + sqr64(r[0],r[1],a[0]); + if (--n == 0) break; + + sqr64(r[2],r[3],a[1]); + if (--n == 0) break; + + sqr64(r[4],r[5],a[2]); + if (--n == 0) break; + + sqr64(r[6],r[7],a[3]); + if (--n == 0) break; + + a+=4; + r+=8; + } + } + +#endif + +#if defined(BN_LLONG) && defined(BN_DIV2W) + +BN_ULONG bn_div_words(h,l,d) +BN_ULONG h,l,d; + { + return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d)); + } + +#else + +/* Divide h-l by d and return the result. */ +/* I need to test this some more :-( */ +BN_ULONG bn_div_words(h,l,d) +BN_ULONG h,l,d; + { + BN_ULONG dh,dl,q,ret=0,th,tl,t; + int i,count=2; + + if (d == 0) return(BN_MASK2); + + i=BN_num_bits_word(d); + if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i)) + { +#if !defined(NO_STDIO) && !defined(WIN16) + fprintf(stderr,"Division would overflow (%d)\n",i); +#endif + abort(); + } + i=BN_BITS2-i; + if (h >= d) h-=d; + + if (i) + { + d<<=i; + h=(h<<i)|(l>>(BN_BITS2-i)); + l<<=i; + } + dh=(d&BN_MASK2h)>>BN_BITS4; + dl=(d&BN_MASK2l); + for (;;) + { + if ((h>>BN_BITS4) == dh) + q=BN_MASK2l; + else + q=h/dh; + + for (;;) + { + t=(h-q*dh); + if ((t&BN_MASK2h) || + ((dl*q) <= ( + (t<<BN_BITS4)+ + ((l&BN_MASK2h)>>BN_BITS4)))) + break; + q--; + } + th=q*dh; + tl=q*dl; + t=(tl>>BN_BITS4); + tl=(tl<<BN_BITS4)&BN_MASK2h; + th+=t; + + if (l < tl) th++; + l-=tl; + if (h < th) + { + h+=d; + q--; + } + h-=th; + + if (--count == 0) break; + + ret=q<<BN_BITS4; + h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2; + 
l=(l&BN_MASK2l)<<BN_BITS4; + } + ret|=q; + return(ret); + } +#endif + +#ifdef BN_LLONG +BN_ULONG bn_add_words(r,a,b,n) +BN_ULONG *r,*a,*b; +int n; + { + BN_ULLONG ll=0; + + bn_check_num(n); + if (n <= 0) return((BN_ULONG)0); + + for (;;) + { + ll+=(BN_ULLONG)a[0]+b[0]; + r[0]=(BN_ULONG)ll&BN_MASK2; + ll>>=BN_BITS2; + if (--n <= 0) break; + + ll+=(BN_ULLONG)a[1]+b[1]; + r[1]=(BN_ULONG)ll&BN_MASK2; + ll>>=BN_BITS2; + if (--n <= 0) break; + + ll+=(BN_ULLONG)a[2]+b[2]; + r[2]=(BN_ULONG)ll&BN_MASK2; + ll>>=BN_BITS2; + if (--n <= 0) break; + + ll+=(BN_ULLONG)a[3]+b[3]; + r[3]=(BN_ULONG)ll&BN_MASK2; + ll>>=BN_BITS2; + if (--n <= 0) break; + + a+=4; + b+=4; + r+=4; + } + return((BN_ULONG)ll); + } +#else +BN_ULONG bn_add_words(r,a,b,n) +BN_ULONG *r,*a,*b; +int n; + { + BN_ULONG c,l,t; + + bn_check_num(n); + if (n <= 0) return((BN_ULONG)0); + + c=0; + for (;;) + { + t=a[0]; + t=(t+c)&BN_MASK2; + c=(t < c); + l=(t+b[0])&BN_MASK2; + c+=(l < t); + r[0]=l; + if (--n <= 0) break; + + t=a[1]; + t=(t+c)&BN_MASK2; + c=(t < c); + l=(t+b[1])&BN_MASK2; + c+=(l < t); + r[1]=l; + if (--n <= 0) break; + + t=a[2]; + t=(t+c)&BN_MASK2; + c=(t < c); + l=(t+b[2])&BN_MASK2; + c+=(l < t); + r[2]=l; + if (--n <= 0) break; + + t=a[3]; + t=(t+c)&BN_MASK2; + c=(t < c); + l=(t+b[3])&BN_MASK2; + c+=(l < t); + r[3]=l; + if (--n <= 0) break; + + a+=4; + b+=4; + r+=4; + } + return((BN_ULONG)c); + } +#endif + +BN_ULONG bn_sub_words(r,a,b,n) +BN_ULONG *r,*a,*b; +int n; + { + BN_ULONG t1,t2; + int c=0; + + bn_check_num(n); + if (n <= 0) return((BN_ULONG)0); + + for (;;) + { + t1=a[0]; t2=b[0]; + r[0]=(t1-t2-c)&BN_MASK2; + if (t1 != t2) c=(t1 < t2); + if (--n <= 0) break; + + t1=a[1]; t2=b[1]; + r[1]=(t1-t2-c)&BN_MASK2; + if (t1 != t2) c=(t1 < t2); + if (--n <= 0) break; + + t1=a[2]; t2=b[2]; + r[2]=(t1-t2-c)&BN_MASK2; + if (t1 != t2) c=(t1 < t2); + if (--n <= 0) break; + + t1=a[3]; t2=b[3]; + r[3]=(t1-t2-c)&BN_MASK2; + if (t1 != t2) c=(t1 < t2); + if (--n <= 0) break; + + a+=4; + b+=4; + r+=4; + } + 
return(c); + } + +#ifdef BN_COMBA + +#undef bn_mul_comba8 +#undef bn_mul_comba4 +#undef bn_sqr_comba8 +#undef bn_sqr_comba4 + +#ifdef BN_LLONG +#define mul_add_c(a,b,c0,c1,c2) \ + t=(BN_ULLONG)a*b; \ + t1=(BN_ULONG)Lw(t); \ + t2=(BN_ULONG)Hw(t); \ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define mul_add_c2(a,b,c0,c1,c2) \ + t=(BN_ULLONG)a*b; \ + tt=(t+t)&BN_MASK; \ + if (tt < t) c2++; \ + t1=(BN_ULONG)Lw(tt); \ + t2=(BN_ULONG)Hw(tt); \ + c0=(c0+t1)&BN_MASK2; \ + if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c(a,i,c0,c1,c2) \ + t=(BN_ULLONG)a[i]*a[i]; \ + t1=(BN_ULONG)Lw(t); \ + t2=(BN_ULONG)Hw(t); \ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c2(a,i,j,c0,c1,c2) \ + mul_add_c2((a)[i],(a)[j],c0,c1,c2) +#else +#define mul_add_c(a,b,c0,c1,c2) \ + t1=LBITS(a); t2=HBITS(a); \ + bl=LBITS(b); bh=HBITS(b); \ + mul64(t1,t2,bl,bh); \ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define mul_add_c2(a,b,c0,c1,c2) \ + t1=LBITS(a); t2=HBITS(a); \ + bl=LBITS(b); bh=HBITS(b); \ + mul64(t1,t2,bl,bh); \ + if (t2 & BN_TBIT) c2++; \ + t2=(t2+t2)&BN_MASK2; \ + if (t1 & BN_TBIT) t2++; \ + t1=(t1+t1)&BN_MASK2; \ + c0=(c0+t1)&BN_MASK2; \ + if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c(a,i,c0,c1,c2) \ + sqr64(t1,t2,(a)[i]); \ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c2(a,i,j,c0,c1,c2) \ + mul_add_c2((a)[i],(a)[j],c0,c1,c2) +#endif + +void bn_mul_comba8(r,a,b) +BN_ULONG *r,*a,*b; + { +#ifdef BN_LLONG + BN_ULLONG t; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + + c1=0; + c2=0; + c3=0; + mul_add_c(a[0],b[0],c1,c2,c3); + r[0]=c1; + c1=0; + mul_add_c(a[0],b[1],c2,c3,c1); + mul_add_c(a[1],b[0],c2,c3,c1); + 
r[1]=c2; + c2=0; + mul_add_c(a[2],b[0],c3,c1,c2); + mul_add_c(a[1],b[1],c3,c1,c2); + mul_add_c(a[0],b[2],c3,c1,c2); + r[2]=c3; + c3=0; + mul_add_c(a[0],b[3],c1,c2,c3); + mul_add_c(a[1],b[2],c1,c2,c3); + mul_add_c(a[2],b[1],c1,c2,c3); + mul_add_c(a[3],b[0],c1,c2,c3); + r[3]=c1; + c1=0; + mul_add_c(a[4],b[0],c2,c3,c1); + mul_add_c(a[3],b[1],c2,c3,c1); + mul_add_c(a[2],b[2],c2,c3,c1); + mul_add_c(a[1],b[3],c2,c3,c1); + mul_add_c(a[0],b[4],c2,c3,c1); + r[4]=c2; + c2=0; + mul_add_c(a[0],b[5],c3,c1,c2); + mul_add_c(a[1],b[4],c3,c1,c2); + mul_add_c(a[2],b[3],c3,c1,c2); + mul_add_c(a[3],b[2],c3,c1,c2); + mul_add_c(a[4],b[1],c3,c1,c2); + mul_add_c(a[5],b[0],c3,c1,c2); + r[5]=c3; + c3=0; + mul_add_c(a[6],b[0],c1,c2,c3); + mul_add_c(a[5],b[1],c1,c2,c3); + mul_add_c(a[4],b[2],c1,c2,c3); + mul_add_c(a[3],b[3],c1,c2,c3); + mul_add_c(a[2],b[4],c1,c2,c3); + mul_add_c(a[1],b[5],c1,c2,c3); + mul_add_c(a[0],b[6],c1,c2,c3); + r[6]=c1; + c1=0; + mul_add_c(a[0],b[7],c2,c3,c1); + mul_add_c(a[1],b[6],c2,c3,c1); + mul_add_c(a[2],b[5],c2,c3,c1); + mul_add_c(a[3],b[4],c2,c3,c1); + mul_add_c(a[4],b[3],c2,c3,c1); + mul_add_c(a[5],b[2],c2,c3,c1); + mul_add_c(a[6],b[1],c2,c3,c1); + mul_add_c(a[7],b[0],c2,c3,c1); + r[7]=c2; + c2=0; + mul_add_c(a[7],b[1],c3,c1,c2); + mul_add_c(a[6],b[2],c3,c1,c2); + mul_add_c(a[5],b[3],c3,c1,c2); + mul_add_c(a[4],b[4],c3,c1,c2); + mul_add_c(a[3],b[5],c3,c1,c2); + mul_add_c(a[2],b[6],c3,c1,c2); + mul_add_c(a[1],b[7],c3,c1,c2); + r[8]=c3; + c3=0; + mul_add_c(a[2],b[7],c1,c2,c3); + mul_add_c(a[3],b[6],c1,c2,c3); + mul_add_c(a[4],b[5],c1,c2,c3); + mul_add_c(a[5],b[4],c1,c2,c3); + mul_add_c(a[6],b[3],c1,c2,c3); + mul_add_c(a[7],b[2],c1,c2,c3); + r[9]=c1; + c1=0; + mul_add_c(a[7],b[3],c2,c3,c1); + mul_add_c(a[6],b[4],c2,c3,c1); + mul_add_c(a[5],b[5],c2,c3,c1); + mul_add_c(a[4],b[6],c2,c3,c1); + mul_add_c(a[3],b[7],c2,c3,c1); + r[10]=c2; + c2=0; + mul_add_c(a[4],b[7],c3,c1,c2); + mul_add_c(a[5],b[6],c3,c1,c2); + mul_add_c(a[6],b[5],c3,c1,c2); + 
mul_add_c(a[7],b[4],c3,c1,c2); + r[11]=c3; + c3=0; + mul_add_c(a[7],b[5],c1,c2,c3); + mul_add_c(a[6],b[6],c1,c2,c3); + mul_add_c(a[5],b[7],c1,c2,c3); + r[12]=c1; + c1=0; + mul_add_c(a[6],b[7],c2,c3,c1); + mul_add_c(a[7],b[6],c2,c3,c1); + r[13]=c2; + c2=0; + mul_add_c(a[7],b[7],c3,c1,c2); + r[14]=c3; + r[15]=c1; + } + +void bn_mul_comba4(r,a,b) +BN_ULONG *r,*a,*b; + { +#ifdef BN_LLONG + BN_ULLONG t; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + + c1=0; + c2=0; + c3=0; + mul_add_c(a[0],b[0],c1,c2,c3); + r[0]=c1; + c1=0; + mul_add_c(a[0],b[1],c2,c3,c1); + mul_add_c(a[1],b[0],c2,c3,c1); + r[1]=c2; + c2=0; + mul_add_c(a[2],b[0],c3,c1,c2); + mul_add_c(a[1],b[1],c3,c1,c2); + mul_add_c(a[0],b[2],c3,c1,c2); + r[2]=c3; + c3=0; + mul_add_c(a[0],b[3],c1,c2,c3); + mul_add_c(a[1],b[2],c1,c2,c3); + mul_add_c(a[2],b[1],c1,c2,c3); + mul_add_c(a[3],b[0],c1,c2,c3); + r[3]=c1; + c1=0; + mul_add_c(a[3],b[1],c2,c3,c1); + mul_add_c(a[2],b[2],c2,c3,c1); + mul_add_c(a[1],b[3],c2,c3,c1); + r[4]=c2; + c2=0; + mul_add_c(a[2],b[3],c3,c1,c2); + mul_add_c(a[3],b[2],c3,c1,c2); + r[5]=c3; + c3=0; + mul_add_c(a[3],b[3],c1,c2,c3); + r[6]=c1; + r[7]=c2; + } + +void bn_sqr_comba8(r,a) +BN_ULONG *r,*a; + { +#ifdef BN_LLONG + BN_ULLONG t,tt; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + + c1=0; + c2=0; + c3=0; + sqr_add_c(a,0,c1,c2,c3); + r[0]=c1; + c1=0; + sqr_add_c2(a,1,0,c2,c3,c1); + r[1]=c2; + c2=0; + sqr_add_c(a,1,c3,c1,c2); + sqr_add_c2(a,2,0,c3,c1,c2); + r[2]=c3; + c3=0; + sqr_add_c2(a,3,0,c1,c2,c3); + sqr_add_c2(a,2,1,c1,c2,c3); + r[3]=c1; + c1=0; + sqr_add_c(a,2,c2,c3,c1); + sqr_add_c2(a,3,1,c2,c3,c1); + sqr_add_c2(a,4,0,c2,c3,c1); + r[4]=c2; + c2=0; + sqr_add_c2(a,5,0,c3,c1,c2); + sqr_add_c2(a,4,1,c3,c1,c2); + sqr_add_c2(a,3,2,c3,c1,c2); + r[5]=c3; + c3=0; + sqr_add_c(a,3,c1,c2,c3); + sqr_add_c2(a,4,2,c1,c2,c3); + sqr_add_c2(a,5,1,c1,c2,c3); + sqr_add_c2(a,6,0,c1,c2,c3); + r[6]=c1; + c1=0; + sqr_add_c2(a,7,0,c2,c3,c1); + 
sqr_add_c2(a,6,1,c2,c3,c1); + sqr_add_c2(a,5,2,c2,c3,c1); + sqr_add_c2(a,4,3,c2,c3,c1); + r[7]=c2; + c2=0; + sqr_add_c(a,4,c3,c1,c2); + sqr_add_c2(a,5,3,c3,c1,c2); + sqr_add_c2(a,6,2,c3,c1,c2); + sqr_add_c2(a,7,1,c3,c1,c2); + r[8]=c3; + c3=0; + sqr_add_c2(a,7,2,c1,c2,c3); + sqr_add_c2(a,6,3,c1,c2,c3); + sqr_add_c2(a,5,4,c1,c2,c3); + r[9]=c1; + c1=0; + sqr_add_c(a,5,c2,c3,c1); + sqr_add_c2(a,6,4,c2,c3,c1); + sqr_add_c2(a,7,3,c2,c3,c1); + r[10]=c2; + c2=0; + sqr_add_c2(a,7,4,c3,c1,c2); + sqr_add_c2(a,6,5,c3,c1,c2); + r[11]=c3; + c3=0; + sqr_add_c(a,6,c1,c2,c3); + sqr_add_c2(a,7,5,c1,c2,c3); + r[12]=c1; + c1=0; + sqr_add_c2(a,7,6,c2,c3,c1); + r[13]=c2; + c2=0; + sqr_add_c(a,7,c3,c1,c2); + r[14]=c3; + r[15]=c1; + } + +void bn_sqr_comba4(r,a) +BN_ULONG *r,*a; + { +#ifdef BN_LLONG + BN_ULLONG t,tt; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + + c1=0; + c2=0; + c3=0; + sqr_add_c(a,0,c1,c2,c3); + r[0]=c1; + c1=0; + sqr_add_c2(a,1,0,c2,c3,c1); + r[1]=c2; + c2=0; + sqr_add_c(a,1,c3,c1,c2); + sqr_add_c2(a,2,0,c3,c1,c2); + r[2]=c3; + c3=0; + sqr_add_c2(a,3,0,c1,c2,c3); + sqr_add_c2(a,2,1,c1,c2,c3); + r[3]=c1; + c1=0; + sqr_add_c(a,2,c2,c3,c1); + sqr_add_c2(a,3,1,c2,c3,c1); + r[4]=c2; + c2=0; + sqr_add_c2(a,3,2,c3,c1,c2); + r[5]=c3; + c3=0; + sqr_add_c(a,3,c1,c2,c3); + r[6]=c1; + r[7]=c2; + } +#else + +/* hmm... is it faster just to do a multiply? 
*/ +void bn_sqr_comba4(r,a) +BN_ULONG *r,*a; + { + BN_ULONG t[8]; + bn_sqr_normal(r,a,4,t); + } + +void bn_sqr_comba8(r,a) +BN_ULONG *r,*a; + { + BN_ULONG t[16]; + bn_sqr_normal(r,a,8,t); + } + +void bn_mul_comba4(r,a,b) +BN_ULONG *r,*a,*b; + { + r[4]=bn_mul_words( &(r[0]),a,4,b[0]); + r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]); + r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]); + r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]); + } + +void bn_mul_comba8(r,a,b) +BN_ULONG *r,*a,*b; + { + r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]); + r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]); + r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]); + r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]); + r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]); + r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]); + r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]); + r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); + } + +#endif /* BN_COMBA */ diff --git a/crypto/bn/bn_blind.c b/crypto/bn/bn_blind.c index a7b34f0bf0..35be32b99a 100644 --- a/crypto/bn/bn_blind.c +++ b/crypto/bn/bn_blind.c @@ -67,8 +67,14 @@ BIGNUM *mod; { BN_BLINDING *ret=NULL; + bn_check_top(Ai); + bn_check_top(mod); + if ((ret=(BN_BLINDING *)Malloc(sizeof(BN_BLINDING))) == NULL) + { BNerr(BN_F_BN_BLINDING_NEW,ERR_R_MALLOC_FAILURE); + return(NULL); + } memset(ret,0,sizeof(BN_BLINDING)); if ((ret->A=BN_new()) == NULL) goto err; if ((ret->Ai=BN_new()) == NULL) goto err; @@ -78,7 +84,7 @@ BIGNUM *mod; return(ret); err: if (ret != NULL) BN_BLINDING_free(ret); - return(ret); + return(NULL); } void BN_BLINDING_free(r) @@ -114,6 +120,8 @@ BIGNUM *n; BN_BLINDING *b; BN_CTX *ctx; { + bn_check_top(n); + if ((b->A == NULL) || (b->Ai == NULL)) { BNerr(BN_F_BN_BLINDING_CONVERT,BN_R_NOT_INITALISED); @@ -128,6 +136,8 @@ BN_BLINDING *b; BN_CTX *ctx; { int ret; + + bn_check_top(n); if ((b->A == NULL) || (b->Ai == NULL)) { BNerr(BN_F_BN_BLINDING_INVERT,BN_R_NOT_INITALISED); diff --git a/crypto/bn/bn_comba.c b/crypto/bn/bn_comba.c new file mode 100644 index 0000000000..30357cf5fb --- /dev/null +++ b/crypto/bn/bn_comba.c @@ 
-0,0 +1,349 @@ +/* crypto/bn/bn_comba.c */ +#include <stdio.h> +#include "bn_lcl.h" +/* Auto generated from crypto/bn/comba.pl + */ + +#undef bn_mul_comba8 +#undef bn_mul_comba4 +#undef bn_sqr_comba8 +#undef bn_sqr_comba4 + +#ifdef BN_LLONG +#define mul_add_c(a,b,c0,c1,c2) \ + t=(BN_ULLONG)a*b; \ + t1=(BN_ULONG)Lw(t); \ + t2=(BN_ULONG)Hw(t); \ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define mul_add_c2(a,b,c0,c1,c2) \ + t=(BN_ULLONG)a*b; \ + tt=(t+t)&BN_MASK; \ + if (tt < t) c2++; \ + t1=(BN_ULONG)Lw(tt); \ + t2=(BN_ULONG)Hw(tt); \ + c0=(c0+t1)&BN_MASK2; \ + if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c(a,i,c0,c1,c2) \ + t=(BN_ULLONG)a[i]*a[i]; \ + t1=(BN_ULONG)Lw(t); \ + t2=(BN_ULONG)Hw(t); \ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c2(a,i,j,c0,c1,c2) \ + mul_add_c2((a)[i],(a)[j],c0,c1,c2) +#else +#define mul_add_c(a,b,c0,c1,c2) \ + t1=LBITS(a); t2=HBITS(a); \ + bl=LBITS(b); bh=HBITS(b); \ + mul64(t1,t2,bl,bh); \ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define mul_add_c2(a,b,c0,c1,c2) \ + t1=LBITS(a); t2=HBITS(a); \ + bl=LBITS(b); bh=HBITS(b); \ + mul64(t1,t2,bl,bh); \ + if (t2 & BN_TBIT) c2++; \ + t2=(t2+t2)&BN_MASK2; \ + if (t1 & BN_TBIT) t2++; \ + t1=(t1+t1)&BN_MASK2; \ + c0=(c0+t1)&BN_MASK2; \ + if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c(a,i,c0,c1,c2) \ + sqr64(t1,t2,(a)[i]); \ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c2(a,i,j,c0,c1,c2) \ + mul_add_c2((a)[i],(a)[j],c0,c1,c2) +#endif + +void bn_mul_comba88(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b); +void bn_mul_comba44(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b); +void bn_sqr_comba88(BN_ULONG *r,BN_ULONG *a); +void bn_sqr_comba44(BN_ULONG 
*r,BN_ULONG *a); + +void bn_mul_comba88(r,a,b) +BN_ULONG *r,*a,*b; + { +#ifdef BN_LLONG + BN_ULLONG t; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + + c1=0; + c2=0; + c3=0; + mul_add_c(a[0],b[0],c1,c2,c3); + r[0]=c1; + c1=0; + mul_add_c(a[0],b[1],c2,c3,c1); + mul_add_c(a[1],b[0],c2,c3,c1); + r[1]=c2; + c2=0; + mul_add_c(a[2],b[0],c3,c1,c2); + mul_add_c(a[1],b[1],c3,c1,c2); + mul_add_c(a[0],b[2],c3,c1,c2); + r[2]=c3; + c3=0; + mul_add_c(a[0],b[3],c1,c2,c3); + mul_add_c(a[1],b[2],c1,c2,c3); + mul_add_c(a[2],b[1],c1,c2,c3); + mul_add_c(a[3],b[0],c1,c2,c3); + r[3]=c1; + c1=0; + mul_add_c(a[4],b[0],c2,c3,c1); + mul_add_c(a[3],b[1],c2,c3,c1); + mul_add_c(a[2],b[2],c2,c3,c1); + mul_add_c(a[1],b[3],c2,c3,c1); + mul_add_c(a[0],b[4],c2,c3,c1); + r[4]=c2; + c2=0; + mul_add_c(a[0],b[5],c3,c1,c2); + mul_add_c(a[1],b[4],c3,c1,c2); + mul_add_c(a[2],b[3],c3,c1,c2); + mul_add_c(a[3],b[2],c3,c1,c2); + mul_add_c(a[4],b[1],c3,c1,c2); + mul_add_c(a[5],b[0],c3,c1,c2); + r[5]=c3; + c3=0; + mul_add_c(a[6],b[0],c1,c2,c3); + mul_add_c(a[5],b[1],c1,c2,c3); + mul_add_c(a[4],b[2],c1,c2,c3); + mul_add_c(a[3],b[3],c1,c2,c3); + mul_add_c(a[2],b[4],c1,c2,c3); + mul_add_c(a[1],b[5],c1,c2,c3); + mul_add_c(a[0],b[6],c1,c2,c3); + r[6]=c1; + c1=0; + mul_add_c(a[0],b[7],c2,c3,c1); + mul_add_c(a[1],b[6],c2,c3,c1); + mul_add_c(a[2],b[5],c2,c3,c1); + mul_add_c(a[3],b[4],c2,c3,c1); + mul_add_c(a[4],b[3],c2,c3,c1); + mul_add_c(a[5],b[2],c2,c3,c1); + mul_add_c(a[6],b[1],c2,c3,c1); + mul_add_c(a[7],b[0],c2,c3,c1); + r[7]=c2; + c2=0; + mul_add_c(a[7],b[1],c3,c1,c2); + mul_add_c(a[6],b[2],c3,c1,c2); + mul_add_c(a[5],b[3],c3,c1,c2); + mul_add_c(a[4],b[4],c3,c1,c2); + mul_add_c(a[3],b[5],c3,c1,c2); + mul_add_c(a[2],b[6],c3,c1,c2); + mul_add_c(a[1],b[7],c3,c1,c2); + r[8]=c3; + c3=0; + mul_add_c(a[2],b[7],c1,c2,c3); + mul_add_c(a[3],b[6],c1,c2,c3); + mul_add_c(a[4],b[5],c1,c2,c3); + mul_add_c(a[5],b[4],c1,c2,c3); + mul_add_c(a[6],b[3],c1,c2,c3); + mul_add_c(a[7],b[2],c1,c2,c3); + r[9]=c1; 
+ c1=0; + mul_add_c(a[7],b[3],c2,c3,c1); + mul_add_c(a[6],b[4],c2,c3,c1); + mul_add_c(a[5],b[5],c2,c3,c1); + mul_add_c(a[4],b[6],c2,c3,c1); + mul_add_c(a[3],b[7],c2,c3,c1); + r[10]=c2; + c2=0; + mul_add_c(a[4],b[7],c3,c1,c2); + mul_add_c(a[5],b[6],c3,c1,c2); + mul_add_c(a[6],b[5],c3,c1,c2); + mul_add_c(a[7],b[4],c3,c1,c2); + r[11]=c3; + c3=0; + mul_add_c(a[7],b[5],c1,c2,c3); + mul_add_c(a[6],b[6],c1,c2,c3); + mul_add_c(a[5],b[7],c1,c2,c3); + r[12]=c1; + c1=0; + mul_add_c(a[6],b[7],c2,c3,c1); + mul_add_c(a[7],b[6],c2,c3,c1); + r[13]=c2; + c2=0; + mul_add_c(a[7],b[7],c3,c1,c2); + r[14]=c3; + r[15]=c1; + } + +void bn_mul_comba44(r,a,b) +BN_ULONG *r,*a,*b; + { +#ifdef BN_LLONG + BN_ULLONG t; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + + c1=0; + c2=0; + c3=0; + mul_add_c(a[0],b[0],c1,c2,c3); + r[0]=c1; + c1=0; + mul_add_c(a[0],b[1],c2,c3,c1); + mul_add_c(a[1],b[0],c2,c3,c1); + r[1]=c2; + c2=0; + mul_add_c(a[2],b[0],c3,c1,c2); + mul_add_c(a[1],b[1],c3,c1,c2); + mul_add_c(a[0],b[2],c3,c1,c2); + r[2]=c3; + c3=0; + mul_add_c(a[0],b[3],c1,c2,c3); + mul_add_c(a[1],b[2],c1,c2,c3); + mul_add_c(a[2],b[1],c1,c2,c3); + mul_add_c(a[3],b[0],c1,c2,c3); + r[3]=c1; + c1=0; + mul_add_c(a[3],b[1],c2,c3,c1); + mul_add_c(a[2],b[2],c2,c3,c1); + mul_add_c(a[1],b[3],c2,c3,c1); + r[4]=c2; + c2=0; + mul_add_c(a[2],b[3],c3,c1,c2); + mul_add_c(a[3],b[2],c3,c1,c2); + r[5]=c3; + c3=0; + mul_add_c(a[3],b[3],c1,c2,c3); + r[6]=c1; + r[7]=c2; + } + +void bn_sqr_comba88(r,a) +BN_ULONG *r,*a; + { +#ifdef BN_LLONG + BN_ULLONG t,tt; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + + c1=0; + c2=0; + c3=0; + sqr_add_c(a,0,c1,c2,c3); + r[0]=c1; + c1=0; + sqr_add_c2(a,1,0,c2,c3,c1); + r[1]=c2; + c2=0; + sqr_add_c(a,1,c3,c1,c2); + sqr_add_c2(a,2,0,c3,c1,c2); + r[2]=c3; + c3=0; + sqr_add_c2(a,3,0,c1,c2,c3); + sqr_add_c2(a,2,1,c1,c2,c3); + r[3]=c1; + c1=0; + sqr_add_c(a,2,c2,c3,c1); + sqr_add_c2(a,3,1,c2,c3,c1); + sqr_add_c2(a,4,0,c2,c3,c1); + r[4]=c2; + 
c2=0; + sqr_add_c2(a,5,0,c3,c1,c2); + sqr_add_c2(a,4,1,c3,c1,c2); + sqr_add_c2(a,3,2,c3,c1,c2); + r[5]=c3; + c3=0; + sqr_add_c(a,3,c1,c2,c3); + sqr_add_c2(a,4,2,c1,c2,c3); + sqr_add_c2(a,5,1,c1,c2,c3); + sqr_add_c2(a,6,0,c1,c2,c3); + r[6]=c1; + c1=0; + sqr_add_c2(a,7,0,c2,c3,c1); + sqr_add_c2(a,6,1,c2,c3,c1); + sqr_add_c2(a,5,2,c2,c3,c1); + sqr_add_c2(a,4,3,c2,c3,c1); + r[7]=c2; + c2=0; + sqr_add_c(a,4,c3,c1,c2); + sqr_add_c2(a,5,3,c3,c1,c2); + sqr_add_c2(a,6,2,c3,c1,c2); + sqr_add_c2(a,7,1,c3,c1,c2); + r[8]=c3; + c3=0; + sqr_add_c2(a,7,2,c1,c2,c3); + sqr_add_c2(a,6,3,c1,c2,c3); + sqr_add_c2(a,5,4,c1,c2,c3); + r[9]=c1; + c1=0; + sqr_add_c(a,5,c2,c3,c1); + sqr_add_c2(a,6,4,c2,c3,c1); + sqr_add_c2(a,7,3,c2,c3,c1); + r[10]=c2; + c2=0; + sqr_add_c2(a,7,4,c3,c1,c2); + sqr_add_c2(a,6,5,c3,c1,c2); + r[11]=c3; + c3=0; + sqr_add_c(a,6,c1,c2,c3); + sqr_add_c2(a,7,5,c1,c2,c3); + r[12]=c1; + c1=0; + sqr_add_c2(a,7,6,c2,c3,c1); + r[13]=c2; + c2=0; + sqr_add_c(a,7,c3,c1,c2); + r[14]=c3; + r[15]=c1; + } + +void bn_sqr_comba44(r,a) +BN_ULONG *r,*a; + { +#ifdef BN_LLONG + BN_ULLONG t,tt; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + + c1=0; + c2=0; + c3=0; + sqr_add_c(a,0,c1,c2,c3); + r[0]=c1; + c1=0; + sqr_add_c2(a,1,0,c2,c3,c1); + r[1]=c2; + c2=0; + sqr_add_c(a,1,c3,c1,c2); + sqr_add_c2(a,2,0,c3,c1,c2); + r[2]=c3; + c3=0; + sqr_add_c2(a,3,0,c1,c2,c3); + sqr_add_c2(a,2,1,c1,c2,c3); + r[3]=c1; + c1=0; + sqr_add_c(a,2,c2,c3,c1); + sqr_add_c2(a,3,1,c2,c3,c1); + r[4]=c2; + c2=0; + sqr_add_c2(a,3,2,c3,c1,c2); + r[5]=c3; + c3=0; + sqr_add_c(a,3,c1,c2,c3); + r[6]=c1; + r[7]=c2; + } diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c index 2263bdc7da..c7bc04d0b4 100644 --- a/crypto/bn/bn_div.c +++ b/crypto/bn/bn_div.c @@ -72,6 +72,8 @@ BN_CTX *ctx; int i,nm,nd; BIGNUM *D; + bn_check_top(m); + bn_check_top(d); if (BN_is_zero(d)) { BNerr(BN_F_BN_DIV,BN_R_DIV_BY_ZERO); @@ -86,9 +88,9 @@ BN_CTX *ctx; return(1); } - D=ctx->bn[ctx->tos]; - if (dv == NULL) 
dv=ctx->bn[ctx->tos+1]; - if (rem == NULL) rem=ctx->bn[ctx->tos+2]; + D= &(ctx->bn[ctx->tos]); + if (dv == NULL) dv= &(ctx->bn[ctx->tos+1]); + if (rem == NULL) rem= &(ctx->bn[ctx->tos+2]); nd=BN_num_bits(d); nm=BN_num_bits(m); @@ -98,6 +100,7 @@ BN_CTX *ctx; /* The next 2 are needed so we can do a dv->d[0]|=1 later * since BN_lshift1 will only work once there is a value :-) */ BN_zero(dv); + bn_wexpand(dv,1); dv->top=1; if (!BN_lshift(D,D,nm-nd)) return(0); @@ -107,7 +110,7 @@ BN_CTX *ctx; if (BN_ucmp(rem,D) >= 0) { dv->d[0]|=1; - bn_qsub(rem,rem,D); + if (!BN_usub(rem,rem,D)) return(0); } /* CAN IMPROVE (and have now :=) */ if (!BN_rshift1(D,D)) return(0); @@ -132,6 +135,9 @@ BN_CTX *ctx; BN_ULONG d0,d1; int num_n,div_n; + bn_check_top(num); + bn_check_top(divisor); + if (BN_is_zero(divisor)) { BNerr(BN_F_BN_DIV,BN_R_DIV_BY_ZERO); @@ -146,12 +152,12 @@ BN_CTX *ctx; return(1); } - tmp=ctx->bn[ctx->tos]; + tmp= &(ctx->bn[ctx->tos]); tmp->neg=0; - snum=ctx->bn[ctx->tos+1]; - sdiv=ctx->bn[ctx->tos+2]; + snum= &(ctx->bn[ctx->tos+1]); + sdiv= &(ctx->bn[ctx->tos+2]); if (dv == NULL) - res=ctx->bn[ctx->tos+3]; + res= &(ctx->bn[ctx->tos+3]); else res=dv; /* First we normalise the numbers */ @@ -168,10 +174,10 @@ BN_CTX *ctx; /* Lets setup a 'window' into snum * This is the part that corresponds to the current * 'area' being divided */ + BN_init(&wnum); wnum.d= &(snum->d[loop]); wnum.top= div_n; - wnum.max= snum->max; /* a bit of a lie */ - wnum.neg= 0; + wnum.max= snum->max+1; /* a bit of a lie */ /* Get the top 2 words of sdiv */ /* i=sdiv->top; */ @@ -183,8 +189,8 @@ BN_CTX *ctx; /* Setup to 'res' */ res->neg= (num->neg^divisor->neg); - res->top=loop; if (!bn_wexpand(res,(loop+1))) goto err; + res->top=loop; resp= &(res->d[loop-1]); /* space for temp */ @@ -192,7 +198,7 @@ BN_CTX *ctx; if (BN_ucmp(&wnum,sdiv) >= 0) { - bn_qsub(&wnum,&wnum,sdiv); + if (!BN_usub(&wnum,&wnum,sdiv)) goto err; *resp=1; res->d[res->top-1]=1; } @@ -211,7 +217,7 @@ BN_CTX *ctx; if (n0 == d0) 
q=BN_MASK2; else - q=bn_div64(n0,n1,d0); + q=bn_div_words(n0,n1,d0); { #ifdef BN_LLONG BN_ULLONG t1,t2,rem; @@ -284,3 +290,39 @@ err: } #endif + +/* rem != m */ +int BN_mod(rem, m, d,ctx) +BIGNUM *rem; +BIGNUM *m; +BIGNUM *d; +BN_CTX *ctx; + { +#if 0 /* The old slow way */ + int i,nm,nd; + BIGNUM *dv; + + if (BN_ucmp(m,d) < 0) + return((BN_copy(rem,m) == NULL)?0:1); + + dv= &(ctx->bn[ctx->tos]); + + if (!BN_copy(rem,m)) return(0); + + nm=BN_num_bits(rem); + nd=BN_num_bits(d); + if (!BN_lshift(dv,d,nm-nd)) return(0); + for (i=nm-nd; i>=0; i--) + { + if (BN_cmp(rem,dv) >= 0) + { + if (!BN_sub(rem,rem,dv)) return(0); + } + if (!BN_rshift1(dv,dv)) return(0); + } + return(1); +#else + return(BN_div(NULL,rem,m,d,ctx)); +#endif + } + diff --git a/crypto/bn/bn_err.c b/crypto/bn/bn_err.c index 029ae810d5..4c29c1ac55 100644 --- a/crypto/bn/bn_err.c +++ b/crypto/bn/bn_err.c @@ -78,15 +78,18 @@ static ERR_STRING_DATA BN_str_functs[]= {ERR_PACK(0,BN_F_BN_MPI2BN,0), "BN_mpi2bn"}, {ERR_PACK(0,BN_F_BN_NEW,0), "BN_new"}, {ERR_PACK(0,BN_F_BN_RAND,0), "BN_rand"}, +{ERR_PACK(0,BN_F_BN_USUB,0), "BN_usub"}, {0,NULL}, }; static ERR_STRING_DATA BN_str_reasons[]= { +{BN_R_ARG2_LT_ARG3 ,"arg2 lt arg3"}, {BN_R_BAD_RECIPROCAL ,"bad reciprocal"}, {BN_R_CALLED_WITH_EVEN_MODULUS ,"called with even modulus"}, {BN_R_DIV_BY_ZERO ,"div by zero"}, {BN_R_ENCODING_ERROR ,"encoding error"}, +{BN_R_EXPAND_ON_STATIC_BIGNUM_DATA ,"expand on static bignum data"}, {BN_R_INVALID_LENGTH ,"invalid length"}, {BN_R_NOT_INITALISED ,"not initalised"}, {BN_R_NO_INVERSE ,"no inverse"}, @@ -99,8 +102,8 @@ void ERR_load_BN_strings() { static int init=1; - if (init); - {; + if (init) + { init=0; #ifndef NO_ERR ERR_load_strings(ERR_LIB_BN,BN_str_functs); diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c index c056a5083f..44f47e7eb2 100644 --- a/crypto/bn/bn_exp.c +++ b/crypto/bn/bn_exp.c @@ -60,6 +60,8 @@ #include "cryptlib.h" #include "bn_lcl.h" +#define TABLE_SIZE 16 + /* slow but works */ int BN_mod_mul(ret, a, b, 
m, ctx) BIGNUM *ret; @@ -71,11 +73,15 @@ BN_CTX *ctx; BIGNUM *t; int r=0; - t=ctx->bn[ctx->tos++]; + bn_check_top(a); + bn_check_top(b); + bn_check_top(m); + + t= &(ctx->bn[ctx->tos++]); if (a == b) { if (!BN_sqr(t,a,ctx)) goto err; } else - { if (!BN_mul(t,a,b)) goto err; } + { if (!BN_mul(t,a,b,ctx)) goto err; } if (!BN_mod(ret,t,m,ctx)) goto err; r=1; err: @@ -92,8 +98,8 @@ BN_CTX *ctx; int i,bits,ret=0; BIGNUM *v,*tmp; - v=ctx->bn[ctx->tos++]; - tmp=ctx->bn[ctx->tos++]; + v= &(ctx->bn[ctx->tos++]); + tmp= &(ctx->bn[ctx->tos++]); if (BN_copy(v,a) == NULL) goto err; bits=BN_num_bits(p); @@ -108,7 +114,7 @@ BN_CTX *ctx; if (!BN_mod(v,tmp,m,ctx)) goto err; if (BN_is_bit_set(p,i)) { - if (!BN_mul(tmp,r,v)) goto err; + if (!BN_mul(tmp,r,v,ctx)) goto err; if (!BN_mod(r,tmp,m,ctx)) goto err; } } @@ -128,8 +134,8 @@ BN_CTX *ctx; int i,bits,ret=0; BIGNUM *v,*tmp; - v=ctx->bn[ctx->tos++]; - tmp=ctx->bn[ctx->tos++]; + v= &(ctx->bn[ctx->tos++]); + tmp= &(ctx->bn[ctx->tos++]); if (BN_copy(v,a) == NULL) goto err; bits=BN_num_bits(p); @@ -143,7 +149,7 @@ BN_CTX *ctx; if (!BN_sqr(tmp,v,ctx)) goto err; if (BN_is_bit_set(p,i)) { - if (!BN_mul(tmp,r,v)) goto err; + if (!BN_mul(tmp,r,v,ctx)) goto err; } } ret=1; @@ -161,6 +167,10 @@ BN_CTX *ctx; { int ret; + bn_check_top(a); + bn_check_top(p); + bn_check_top(m); + #ifdef MONT_MUL_MOD /* I have finally been able to take out this pre-condition of * the top bit being set. 
It was caused by an error in BN_div @@ -189,13 +199,13 @@ BIGNUM *p; BIGNUM *m; BN_CTX *ctx; { - int nb,i,j,bits,ret=0,wstart,wend,window,wvalue; - int start=1; - BIGNUM *d,*aa; - BIGNUM *val[16]; + int i,j,bits,ret=0,wstart,wend,window,wvalue; + int start=1,ts=0; + BIGNUM *aa; + BIGNUM val[TABLE_SIZE]; + BN_RECP_CTX recp; - d=ctx->bn[ctx->tos++]; - aa=ctx->bn[ctx->tos++]; + aa= &(ctx->bn[ctx->tos++]); bits=BN_num_bits(p); if (bits == 0) @@ -203,12 +213,14 @@ BN_CTX *ctx; BN_one(r); return(1); } - nb=BN_reciprocal(d,m,ctx); - if (nb == -1) goto err; + BN_RECP_CTX_init(&recp); + if (BN_RECP_CTX_set(&recp,m,ctx) <= 0) goto err; + + BN_init(&(val[0])); + ts=1; - val[0]=BN_new(); - if (!BN_mod(val[0],a,m,ctx)) goto err; /* 1 */ - if (!BN_mod_mul_reciprocal(aa,val[0],val[0],m,d,nb,ctx)) + if (!BN_mod(&(val[0]),a,m,ctx)) goto err; /* 1 */ + if (!BN_mod_mul_reciprocal(aa,&(val[0]),&(val[0]),&recp,ctx)) goto err; /* 2 */ if (bits <= 17) /* This is probably 3 or 0x10001, so just do singles */ @@ -223,12 +235,11 @@ BN_CTX *ctx; j=1<<(window-1); for (i=1; i<j; i++) { - val[i]=BN_new(); - if (!BN_mod_mul_reciprocal(val[i],val[i-1],aa,m,d,nb,ctx)) + BN_init(&val[i]); + if (!BN_mod_mul_reciprocal(&(val[i]),&(val[i-1]),aa,&recp,ctx)) goto err; } - for (; i<16; i++) - val[i]=NULL; + ts=i; start=1; /* This is used to avoid multiplication etc * when there is only the value '1' in the @@ -244,7 +255,7 @@ BN_CTX *ctx; if (BN_is_bit_set(p,wstart) == 0) { if (!start) - if (!BN_mod_mul_reciprocal(r,r,r,m,d,nb,ctx)) + if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx)) goto err; if (wstart == 0) break; wstart--; @@ -274,12 +285,12 @@ BN_CTX *ctx; if (!start) for (i=0; i<j; i++) { - if (!BN_mod_mul_reciprocal(r,r,r,m,d,nb,ctx)) + if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx)) goto err; } /* wvalue will be an odd number < 2^window */ - if (!BN_mod_mul_reciprocal(r,r,val[wvalue>>1],m,d,nb,ctx)) + if (!BN_mod_mul_reciprocal(r,r,&(val[wvalue>>1]),&recp,ctx)) goto err; /* move the 'window' down further 
*/ @@ -290,35 +301,40 @@ BN_CTX *ctx; } ret=1; err: - ctx->tos-=2; - for (i=0; i<16; i++) - if (val[i] != NULL) BN_clear_free(val[i]); + ctx->tos--; + for (i=0; i<ts; i++) + BN_clear_free(&(val[i])); + BN_RECP_CTX_free(&recp); return(ret); } /* #endif */ /* #ifdef MONT_MUL_MOD */ -int BN_mod_exp_mont(r,a,p,m,ctx,in_mont) -BIGNUM *r; +int BN_mod_exp_mont(rr,a,p,m,ctx,in_mont) +BIGNUM *rr; BIGNUM *a; BIGNUM *p; BIGNUM *m; BN_CTX *ctx; BN_MONT_CTX *in_mont; { -#define TABLE_SIZE 16 int i,j,bits,ret=0,wstart,wend,window,wvalue; - int start=1; - BIGNUM *d,*aa; - BIGNUM *val[TABLE_SIZE]; + int start=1,ts=0; + BIGNUM *d,*aa,*r; + BIGNUM val[TABLE_SIZE]; BN_MONT_CTX *mont=NULL; + bn_check_top(a); + bn_check_top(p); + bn_check_top(m); + if (!(m->d[0] & 1)) { BNerr(BN_F_BN_MOD_EXP_MONT,BN_R_CALLED_WITH_EVEN_MODULUS); return(0); } - d=ctx->bn[ctx->tos++]; + d= &(ctx->bn[ctx->tos++]); + r= &(ctx->bn[ctx->tos++]); bits=BN_num_bits(p); if (bits == 0) { @@ -339,22 +355,23 @@ BN_MONT_CTX *in_mont; if (!BN_MONT_CTX_set(mont,m,ctx)) goto err; } - val[0]=BN_new(); + BN_init(&val[0]); + ts=1; if (BN_ucmp(a,m) >= 0) { - BN_mod(val[0],a,m,ctx); - aa=val[0]; + BN_mod(&(val[0]),a,m,ctx); + aa= &(val[0]); } else aa=a; - if (!BN_to_montgomery(val[0],aa,mont,ctx)) goto err; /* 1 */ - if (!BN_mod_mul_montgomery(d,val[0],val[0],mont,ctx)) goto err; /* 2 */ + if (!BN_to_montgomery(&(val[0]),aa,mont,ctx)) goto err; /* 1 */ + if (!BN_mod_mul_montgomery(d,&(val[0]),&(val[0]),mont,ctx)) goto err; /* 2 */ if (bits <= 20) /* This is probably 3 or 0x10001, so just do singles */ window=1; - else if (bits > 250) + else if (bits >= 256) window=5; /* max size of window */ - else if (bits >= 120) + else if (bits >= 128) window=4; else window=3; @@ -362,12 +379,11 @@ BN_MONT_CTX *in_mont; j=1<<(window-1); for (i=1; i<j; i++) { - val[i]=BN_new(); - if (!BN_mod_mul_montgomery(val[i],val[i-1],d,mont,ctx)) + BN_init(&(val[i])); + if (!BN_mod_mul_montgomery(&(val[i]),&(val[i-1]),d,mont,ctx)) goto err; } - for (; 
i<TABLE_SIZE; i++) - val[i]=NULL; + ts=i; start=1; /* This is used to avoid multiplication etc * when there is only the value '1' in the @@ -419,7 +435,7 @@ BN_MONT_CTX *in_mont; } /* wvalue will be an odd number < 2^window */ - if (!BN_mod_mul_montgomery(r,r,val[wvalue>>1],mont,ctx)) + if (!BN_mod_mul_montgomery(r,r,&(val[wvalue>>1]),mont,ctx)) goto err; /* move the 'window' down further */ @@ -428,13 +444,13 @@ BN_MONT_CTX *in_mont; start=0; if (wstart < 0) break; } - BN_from_montgomery(r,r,mont,ctx); + BN_from_montgomery(rr,r,mont,ctx); ret=1; err: if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont); - ctx->tos--; - for (i=0; i<TABLE_SIZE; i++) - if (val[i] != NULL) BN_clear_free(val[i]); + ctx->tos-=2; + for (i=0; i<ts; i++) + BN_clear_free(&(val[i])); return(ret); } /* #endif */ @@ -447,12 +463,12 @@ BIGNUM *p; BIGNUM *m; BN_CTX *ctx; { - int i,j,bits,ret=0,wstart,wend,window,wvalue; + int i,j,bits,ret=0,wstart,wend,window,wvalue,ts=0; int start=1; BIGNUM *d; - BIGNUM *val[16]; + BIGNUM val[TABLE_SIZE]; - d=ctx->bn[ctx->tos++]; + d= &(ctx->bn[ctx->tos++]); bits=BN_num_bits(p); if (bits == 0) @@ -461,9 +477,10 @@ BN_CTX *ctx; return(1); } - val[0]=BN_new(); - if (!BN_mod(val[0],a,m,ctx)) goto err; /* 1 */ - if (!BN_mod_mul(d,val[0],val[0],m,ctx)) + BN_init(&(val[0])); + ts=1; + if (!BN_mod(&(val[0]),a,m,ctx)) goto err; /* 1 */ + if (!BN_mod_mul(d,&(val[0]),&(val[0]),m,ctx)) goto err; /* 2 */ if (bits <= 17) /* This is probably 3 or 0x10001, so just do singles */ @@ -478,12 +495,11 @@ BN_CTX *ctx; j=1<<(window-1); for (i=1; i<j; i++) { - val[i]=BN_new(); - if (!BN_mod_mul(val[i],val[i-1],d,m,ctx)) + BN_init(&(val[i])); + if (!BN_mod_mul(&(val[i]),&(val[i-1]),d,m,ctx)) goto err; } - for (; i<16; i++) - val[i]=NULL; + ts=i; start=1; /* This is used to avoid multiplication etc * when there is only the value '1' in the @@ -534,7 +550,7 @@ BN_CTX *ctx; } /* wvalue will be an odd number < 2^window */ - if (!BN_mod_mul(r,r,val[wvalue>>1],m,ctx)) + if 
(!BN_mod_mul(r,r,&(val[wvalue>>1]),m,ctx)) goto err; /* move the 'window' down further */ @@ -546,8 +562,8 @@ BN_CTX *ctx; ret=1; err: ctx->tos--; - for (i=0; i<16; i++) - if (val[i] != NULL) BN_clear_free(val[i]); + for (i=0; i<ts; i++) + BN_clear_free(&(val[i])); return(ret); } diff --git a/crypto/bn/bn_exp2.c b/crypto/bn/bn_exp2.c new file mode 100644 index 0000000000..eface739b3 --- /dev/null +++ b/crypto/bn/bn_exp2.c @@ -0,0 +1,202 @@ +#include <stdio.h> +#include "cryptlib.h" +#include "bn_lcl.h" + +/* I've done some timing with different table sizes. + * The main hassle is that even with bits set at 3, this requires + * 63 BIGNUMs to store the pre-calculated values. + * 512 1024 + * bits=1 75.4% 79.4% + * bits=2 61.2% 62.4% + * bits=3 61.3% 59.3% + * The lack of speed improvment is also a function of the pre-calculation + * which could be removed. + */ +#define EXP2_TABLE_BITS 2 /* 1 2 3 4 5 */ +#define EXP2_TABLE_SIZE 4 /* 2 4 8 16 32 */ + +int BN_mod_exp2_mont(rr,a1,p1,a2,p2,m,ctx,in_mont) +BIGNUM *rr; +BIGNUM *a1; +BIGNUM *p1; +BIGNUM *a2; +BIGNUM *p2; +BIGNUM *m; +BN_CTX *ctx; +BN_MONT_CTX *in_mont; + { + int i,j,k,bits,bits1,bits2,ret=0,wstart,wend,window,xvalue,yvalue; + int start=1,ts=0,x,y; + BIGNUM *d,*aa1,*aa2,*r; + BIGNUM val[EXP2_TABLE_SIZE][EXP2_TABLE_SIZE]; + BN_MONT_CTX *mont=NULL; + + bn_check_top(a1); + bn_check_top(p1); + bn_check_top(a2); + bn_check_top(p2); + bn_check_top(m); + + if (!(m->d[0] & 1)) + { + BNerr(BN_F_BN_MOD_EXP_MONT,BN_R_CALLED_WITH_EVEN_MODULUS); + return(0); + } + d= &(ctx->bn[ctx->tos++]); + r= &(ctx->bn[ctx->tos++]); + bits1=BN_num_bits(p1); + bits2=BN_num_bits(p2); + if ((bits1 == 0) && (bits2 == 0)) + { + BN_one(r); + return(1); + } + bits=(bits1 > bits2)?bits1:bits2; + + /* If this is not done, things will break in the montgomery + * part */ + + if (in_mont != NULL) + mont=in_mont; + else + { + if ((mont=BN_MONT_CTX_new()) == NULL) goto err; + if (!BN_MONT_CTX_set(mont,m,ctx)) goto err; + } + + 
BN_init(&(val[0][0])); + BN_init(&(val[1][1])); + BN_init(&(val[0][1])); + BN_init(&(val[1][0])); + ts=1; + if (BN_ucmp(a1,m) >= 0) + { + BN_mod(&(val[1][0]),a1,m,ctx); + aa1= &(val[1][0]); + } + else + aa1=a1; + if (BN_ucmp(a2,m) >= 0) + { + BN_mod(&(val[0][1]),a2,m,ctx); + aa2= &(val[0][1]); + } + else + aa2=a2; + if (!BN_to_montgomery(&(val[1][0]),aa1,mont,ctx)) goto err; + if (!BN_to_montgomery(&(val[0][1]),aa2,mont,ctx)) goto err; + if (!BN_mod_mul_montgomery(&(val[1][1]), + &(val[1][0]),&(val[0][1]),mont,ctx)) + goto err; + +#if 0 + if (bits <= 20) /* This is probably 3 or 0x10001, so just do singles */ + window=1; + else if (bits > 250) + window=5; /* max size of window */ + else if (bits >= 120) + window=4; + else + window=3; +#else + window=EXP2_TABLE_BITS; +#endif + + k=1<<window; + for (x=0; x<k; x++) + { + if (x >= 2) + { + BN_init(&(val[x][0])); + BN_init(&(val[x][1])); + if (!BN_mod_mul_montgomery(&(val[x][0]), + &(val[1][0]),&(val[x-1][0]),mont,ctx)) goto err; + if (!BN_mod_mul_montgomery(&(val[x][1]), + &(val[1][0]),&(val[x-1][1]),mont,ctx)) goto err; + } + for (y=2; y<k; y++) + { + BN_init(&(val[x][y])); + if (!BN_mod_mul_montgomery(&(val[x][y]), + &(val[x][y-1]),&(val[0][1]),mont,ctx)) + goto err; + } + } + ts=k; + + start=1; /* This is used to avoid multiplication etc + * when there is only the value '1' in the + * buffer. */ + xvalue=0; /* The 'x value' of the window */ + yvalue=0; /* The 'y value' of the window */ + wstart=bits-1; /* The top bit of the window */ + wend=0; /* The bottom bit of the window */ + + if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err; + for (;;) + { + xvalue=BN_is_bit_set(p1,wstart); + yvalue=BN_is_bit_set(p2,wstart); + if (!(xvalue || yvalue)) + { + if (!start) + { + if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) + goto err; + } + wstart--; + if (wstart < 0) break; + continue; + } + /* We now have wstart on a 'set' bit, we now need to work out + * how bit a window to do. 
To do this we need to scan + * forward until the last set bit before the end of the + * window */ + j=wstart; + /* xvalue=BN_is_bit_set(p1,wstart); already set */ + /* yvalue=BN_is_bit_set(p1,wstart); already set */ + wend=0; + for (i=1; i<window; i++) + { + if (wstart-i < 0) break; + xvalue+=xvalue; + xvalue|=BN_is_bit_set(p1,wstart-i); + yvalue+=yvalue; + yvalue|=BN_is_bit_set(p2,wstart-i); + } + + /* i is the size of the current window */ + /* add the 'bytes above' */ + if (!start) + for (j=0; j<i; j++) + { + if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) + goto err; + } + + /* wvalue will be an odd number < 2^window */ + if (xvalue || yvalue) + { + if (!BN_mod_mul_montgomery(r,r,&(val[xvalue][yvalue]), + mont,ctx)) goto err; + } + + /* move the 'window' down further */ + wstart-=i; + start=0; + if (wstart < 0) break; + } + BN_from_montgomery(rr,r,mont,ctx); + ret=1; +err: + if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont); + ctx->tos-=2; + for (i=0; i<ts; i++) + { + for (j=0; j<ts; j++) + { + BN_clear_free(&(val[i][j])); + } + } + return(ret); + } diff --git a/crypto/bn/bn_gcd.c b/crypto/bn/bn_gcd.c index 071bba3b4b..c80cecdc8d 100644 --- a/crypto/bn/bn_gcd.c +++ b/crypto/bn/bn_gcd.c @@ -73,8 +73,11 @@ BN_CTX *ctx; BIGNUM *a,*b,*t; int ret=0; - a=ctx->bn[ctx->tos]; - b=ctx->bn[ctx->tos+1]; + bn_check_top(in_a); + bn_check_top(in_b); + + a= &(ctx->bn[ctx->tos]); + b= &(ctx->bn[ctx->tos+1]); if (BN_copy(a,in_a) == NULL) goto err; if (BN_copy(b,in_b) == NULL) goto err; @@ -95,6 +98,9 @@ BIGNUM *a,*b; BIGNUM *t; int shifts=0; + bn_check_top(a); + bn_check_top(b); + for (;;) { if (BN_is_zero(b)) @@ -142,23 +148,30 @@ err: } /* solves ax == 1 (mod n) */ -BIGNUM *BN_mod_inverse(a, n, ctx) +BIGNUM *BN_mod_inverse(in, a, n, ctx) +BIGNUM *in; BIGNUM *a; BIGNUM *n; BN_CTX *ctx; { BIGNUM *A,*B,*X,*Y,*M,*D,*R; - BIGNUM *ret=NULL,*T; + BIGNUM *T,*ret=NULL; int sign; - A=ctx->bn[ctx->tos]; - B=ctx->bn[ctx->tos+1]; - X=ctx->bn[ctx->tos+2]; - D=ctx->bn[ctx->tos+3]; 
- M=ctx->bn[ctx->tos+4]; - Y=ctx->bn[ctx->tos+5]; + bn_check_top(a); + bn_check_top(n); + + A= &(ctx->bn[ctx->tos]); + B= &(ctx->bn[ctx->tos+1]); + X= &(ctx->bn[ctx->tos+2]); + D= &(ctx->bn[ctx->tos+3]); + M= &(ctx->bn[ctx->tos+4]); + Y= &(ctx->bn[ctx->tos+5]); ctx->tos+=6; - R=BN_new(); + if (in == NULL) + R=BN_new(); + else + R=in; if (R == NULL) goto err; BN_zero(X); @@ -175,7 +188,7 @@ BN_CTX *ctx; B=M; /* T has a struct, M does not */ - if (!BN_mul(T,D,X)) goto err; + if (!BN_mul(T,D,X,ctx)) goto err; if (!BN_add(T,T,Y)) goto err; M=Y; Y=X; @@ -196,7 +209,7 @@ BN_CTX *ctx; } ret=R; err: - if ((ret == NULL) && (R != NULL)) BN_free(R); + if ((ret == NULL) && (in == NULL)) BN_free(R); ctx->tos-=6; return(ret); } diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h index edfd788338..70b0787d8f 100644 --- a/crypto/bn/bn_lcl.h +++ b/crypto/bn/bn_lcl.h @@ -65,17 +65,68 @@ extern "C" { #endif +/* Pentium pro 16,16,16,32,64 */ +/* Alpha 16,16,16,16.64 */ +#define BN_MULL_SIZE_NORMAL (16) // 32 +#define BN_MUL_RECURSIVE_SIZE_NORMAL (16) // 32 /* less than */ +#define BN_SQR_RECURSIVE_SIZE_NORMAL (16) // 32 +#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32) // 32 +#define BN_MONT_CTX_SET_SIZE_WORD (64) // 32 + +#ifndef BN_MUL_COMBA +#define bn_mul_comba8(r,a,b) bn_mul_normal(r,a,8,b,8) +#define bn_mul_comba4(r,a,b) bn_mul_normal(r,a,4,b,4) +/* This is probably faster than using the C code - I need to check */ +#define bn_sqr_comba8(r,a) bn_mul_normal(r,a,8,a,8) +#define bn_sqr_comba4(r,a) bn_mul_normal(r,a,4,a,4) +#endif + /************************************************************* * Using the long long type */ #define Lw(t) (((BN_ULONG)(t))&BN_MASK2) #define Hw(t) (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2) -#define bn_fix_top(a) \ - { \ - BN_ULONG *fix_top_l; \ - for (fix_top_l= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \ - if (*(fix_top_l--)) break; \ +/* These are used for internal error checking and are not normally used */ +#ifdef BN_DEBUG +#define bn_check_top(a) 
\ + { if (((a)->top < 0) || ((a)->top > (a)->max)) \ + { char *nullp=NULL; *nullp='z'; } } +#define bn_check_num(a) if ((a) < 0) { char *nullp=NULL; *nullp='z'; } +#else +#define bn_check_top(a) +#define bn_check_num(a) +#endif + +/* This macro is to add extra stuff for development checking */ +#ifdef BN_DEBUG +#define bn_set_max(r) ((r)->max=(r)->top,BN_set_flags((r),BN_FLG_STATIC_DATA)) +#else +#define bn_set_max(r) +#endif + +/* These macros are used to 'take' a section of a bignum for read only use */ +#define bn_set_low(r,a,n) \ + { \ + (r)->top=((a)->top > (n))?(n):(a)->top; \ + (r)->d=(a)->d; \ + (r)->neg=(a)->neg; \ + (r)->flags|=BN_FLG_STATIC_DATA; \ + bn_set_max(r); \ + } + +#define bn_set_high(r,a,n) \ + { \ + if ((a)->top > (n)) \ + { \ + (r)->top=(a)->top-n; \ + (r)->d= &((a)->d[n]); \ + } \ + else \ + (r)->top=0; \ + (r)->neg=(a)->neg; \ + (r)->flags|=BN_FLG_STATIC_DATA; \ + bn_set_max(r); \ } /* #define bn_expand(n,b) ((((b)/BN_BITS2) <= (n)->max)?(n):bn_expand2((n),(b))) */ @@ -175,6 +226,17 @@ extern "C" { #endif +extern int bn_limit_bits; +extern int bn_limit_num; /* (1<<bn_limit_bits) */ +/* Recursive 'low' limit */ +extern int bn_limit_bits_low; +extern int bn_limit_num_low; /* (1<<bn_limit_bits_low) */ +/* Do modified 'high' part calculation' */ +extern int bn_limit_bits_high; +extern int bn_limit_num_high; /* (1<<bn_limit_bits_high) */ +extern int bn_limit_bits_mont; +extern int bn_limit_num_mont; /* (1<<bn_limit_bits_mont) */ + #ifndef NOPROTO BIGNUM *bn_expand2(BIGNUM *b, int bits); @@ -197,3 +259,8 @@ BN_ULONG bn_add_words(); #endif #endif + +void bn_mul_low_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2,BN_ULONG *t); +void bn_mul_high(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,BN_ULONG *l,int n2, BN_ULONG *t); + + diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c index bfe7628ad4..7ea216f919 100644 --- a/crypto/bn/bn_lib.c +++ b/crypto/bn/bn_lib.c @@ -60,7 +60,68 @@ #include "cryptlib.h" #include "bn_lcl.h" -char *BN_version="Big Number 
part of SSLeay 0.9.0b 29-Jun-1998"; +char *BN_version="Big Number part of SSLeay 0.9.1a 06-Jul-1998"; + +/* For a 32 bit machine + * 2 - 4 == 128 + * 3 - 8 == 256 + * 4 - 16 == 512 + * 5 - 32 == 1024 + * 6 - 64 == 2048 + * 7 - 128 == 4096 + * 8 - 256 == 8192 + */ +int bn_limit_bits=0; +int bn_limit_num=8; /* (1<<bn_limit_bits) */ +int bn_limit_bits_low=0; +int bn_limit_num_low=8; /* (1<<bn_limit_bits_low) */ +int bn_limit_bits_high=0; +int bn_limit_num_high=8; /* (1<<bn_limit_bits_high) */ +int bn_limit_bits_mont=0; +int bn_limit_num_mont=8; /* (1<<bn_limit_bits_mont) */ + +void BN_set_params(mult,high,low,mont) +int mult,high,low,mont; + { + if (mult >= 0) + { + if (mult > (sizeof(int)*8)-1) + mult=sizeof(int)*8-1; + bn_limit_bits=mult; + bn_limit_num=1<<mult; + } + if (high >= 0) + { + if (high > (sizeof(int)*8)-1) + high=sizeof(int)*8-1; + bn_limit_bits_high=high; + bn_limit_num_high=1<<high; + } + if (low >= 0) + { + if (low > (sizeof(int)*8)-1) + low=sizeof(int)*8-1; + bn_limit_bits_low=low; + bn_limit_num_low=1<<low; + } + if (mont >= 0) + { + if (mont > (sizeof(int)*8)-1) + mont=sizeof(int)*8-1; + bn_limit_bits_mont=mont; + bn_limit_num_mont=1<<mont; + } + } + +int BN_get_params(which) +int which; + { + if (which == 0) return(bn_limit_bits); + else if (which == 1) return(bn_limit_bits_high); + else if (which == 2) return(bn_limit_bits_low); + else if (which == 3) return(bn_limit_bits_mont); + else return(0); + } BIGNUM *BN_value_one() { @@ -111,24 +172,24 @@ BN_ULONG l; 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, }; -#ifdef SIXTY_FOUR_BIT_LONG +#if defined(SIXTY_FOUR_BIT_LONG) if (l & 0xffffffff00000000L) { if (l & 0xffff000000000000L) { if (l & 0xff00000000000000L) { - return(bits[l>>56]+56); + return(bits[(int)(l>>56)]+56); } - else return(bits[l>>48]+48); + else return(bits[(int)(l>>48)]+48); } else { if (l & 0x0000ff0000000000L) { - return(bits[l>>40]+40); + return(bits[(int)(l>>40)]+40); } - else return(bits[l>>32]+32); + else return(bits[(int)(l>>32)]+32); } } 
else @@ -140,17 +201,17 @@ BN_ULONG l; { if (l & 0xff00000000000000LL) { - return(bits[l>>56]+56); + return(bits[(int)(l>>56)]+56); } - else return(bits[l>>48]+48); + else return(bits[(int)(l>>48)]+48); } else { if (l & 0x0000ff0000000000LL) { - return(bits[l>>40]+40); + return(bits[(int)(l>>40)]+40); } - else return(bits[l>>32]+32); + else return(bits[(int)(l>>32)]+32); } } else @@ -161,18 +222,18 @@ BN_ULONG l; if (l & 0xffff0000L) { if (l & 0xff000000L) - return(bits[l>>24L]+24); - else return(bits[l>>16L]+16); + return(bits[(int)(l>>24L)]+24); + else return(bits[(int)(l>>16L)]+16); } else #endif { #if defined(SIXTEEN_BIT) || defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG) if (l & 0xff00L) - return(bits[l>>8]+8); + return(bits[(int)(l>>8)]+8); else #endif - return(bits[l ] ); + return(bits[(int)(l )] ); } } } @@ -183,6 +244,8 @@ BIGNUM *a; BN_ULONG l; int i; + bn_check_top(a); + if (a->top == 0) return(0); l=a->d[a->top-1]; i=(a->top-1)*BN_BITS2; @@ -199,74 +262,78 @@ BIGNUM *a; void BN_clear_free(a) BIGNUM *a; { + int i; + if (a == NULL) return; if (a->d != NULL) { memset(a->d,0,a->max*sizeof(a->d[0])); - Free(a->d); + if (!(BN_get_flags(a,BN_FLG_STATIC_DATA))) + Free(a->d); } + i=BN_get_flags(a,BN_FLG_MALLOCED); memset(a,0,sizeof(BIGNUM)); - Free(a); + if (i) + Free(a); } void BN_free(a) BIGNUM *a; { if (a == NULL) return; - if (a->d != NULL) Free(a->d); - Free(a); + if ((a->d != NULL) && !(BN_get_flags(a,BN_FLG_STATIC_DATA))) + Free(a->d); + a->flags|=BN_FLG_FREE; /* REMOVE? 
*/ + if (a->flags & BN_FLG_MALLOCED) + Free(a); + } + +void BN_init(a) +BIGNUM *a; + { + memset(a,0,sizeof(BIGNUM)); } BIGNUM *BN_new() { BIGNUM *ret; - BN_ULONG *p; - ret=(BIGNUM *)Malloc(sizeof(BIGNUM)); - if (ret == NULL) goto err; + if ((ret=(BIGNUM *)Malloc(sizeof(BIGNUM))) == NULL) + { + BNerr(BN_F_BN_NEW,ERR_R_MALLOC_FAILURE); + return(NULL); + } + ret->flags=BN_FLG_MALLOCED; ret->top=0; ret->neg=0; - ret->max=(BN_DEFAULT_BITS/BN_BITS2); - p=(BN_ULONG *)Malloc(sizeof(BN_ULONG)*(ret->max+1)); - if (p == NULL) goto err; - ret->d=p; - - memset(p,0,(ret->max+1)*sizeof(p[0])); + ret->max=0; + ret->d=NULL; return(ret); -err: - BNerr(BN_F_BN_NEW,ERR_R_MALLOC_FAILURE); - return(NULL); } + BN_CTX *BN_CTX_new() { BN_CTX *ret; - BIGNUM *n; - int i,j; ret=(BN_CTX *)Malloc(sizeof(BN_CTX)); - if (ret == NULL) goto err2; - - for (i=0; i<BN_CTX_NUM; i++) + if (ret == NULL) { - n=BN_new(); - if (n == NULL) goto err; - ret->bn[i]=n; + BNerr(BN_F_BN_CTX_NEW,ERR_R_MALLOC_FAILURE); + return(NULL); } - /* There is actually an extra one, this is for debugging my - * stuff */ - ret->bn[BN_CTX_NUM]=NULL; - - ret->tos=0; + BN_CTX_init(ret); + ret->flags=BN_FLG_MALLOCED; return(ret); -err: - for (j=0; j<i; j++) - BN_free(ret->bn[j]); - Free(ret); -err2: - BNerr(BN_F_BN_CTX_NEW,ERR_R_MALLOC_FAILURE); - return(NULL); + } + +void BN_CTX_init(ctx) +BN_CTX *ctx; + { + memset(ctx,0,sizeof(BN_CTX)); + ctx->tos=0; + ctx->flags=0; } void BN_CTX_free(c) @@ -275,26 +342,98 @@ BN_CTX *c; int i; for (i=0; i<BN_CTX_NUM; i++) - BN_clear_free(c->bn[i]); - Free(c); + BN_clear_free(&(c->bn[i])); + if (c->flags & BN_FLG_MALLOCED) + Free(c); } BIGNUM *bn_expand2(b, words) BIGNUM *b; int words; { - BN_ULONG *p; + BN_ULONG *A,*B,*a; + int i,j; + + bn_check_top(b); if (words > b->max) { - p=(BN_ULONG *)Realloc(b->d,sizeof(BN_ULONG)*(words+1)); - if (p == NULL) + bn_check_top(b); + if (BN_get_flags(b,BN_FLG_STATIC_DATA)) + { + BNerr(BN_F_BN_EXPAND2,BN_R_EXPAND_ON_STATIC_BIGNUM_DATA); + return(NULL); + } + 
a=A=(BN_ULONG *)Malloc(sizeof(BN_ULONG)*(words+1)); + if (A == NULL) { BNerr(BN_F_BN_EXPAND2,ERR_R_MALLOC_FAILURE); return(NULL); } - b->d=p; - memset(&(p[b->max]),0,((words+1)-b->max)*sizeof(BN_ULONG)); +memset(A,0x5c,sizeof(BN_ULONG)*(words+1)); +#if 1 + B=b->d; + if (B != NULL) + { + for (i=b->top&(~7); i>0; i-=8) + { + A[0]=B[0]; A[1]=B[1]; A[2]=B[2]; A[3]=B[3]; + A[4]=B[4]; A[5]=B[5]; A[6]=B[6]; A[7]=B[7]; + A+=8; + B+=8; + } + switch (b->top&7) + { + case 7: + A[6]=B[6]; + case 6: + A[5]=B[5]; + case 5: + A[4]=B[4]; + case 4: + A[3]=B[3]; + case 3: + A[2]=B[2]; + case 2: + A[1]=B[1]; + case 1: + A[0]=B[0]; + case 0: + /* I need the 'case 0' entry for utrix cc. + * If the optimiser is turned on, it does the + * switch table by doing + * a=top&7 + * a--; + * goto jump_table[a]; + * If top is 0, this makes us jump to 0xffffffc + * which is rather bad :-(. + * eric 23-Apr-1998 + */ + ; + } + B= &(b->d[b->top]); + j=b->max-8; + for (i=b->top; i<j; i+=8) + { + B[0]=0; B[1]=0; B[2]=0; B[3]=0; + B[4]=0; B[5]=0; B[6]=0; B[7]=0; + B+=8; + } + for (j+=8; i<j; i++) + { + B[0]=0; + B++; + } +#else + memcpy(a->d,b->d,sizeof(b->d[0])*b->top); +#endif + +/* memset(&(p[b->max]),0,((words+1)-b->max)*sizeof(BN_ULONG)); */ +/* { int i; for (i=b->max; i<words+1; i++) p[i]=i;} */ + Free(b->d); + } + + b->d=a; b->max=words; } return(b); @@ -305,6 +444,8 @@ BIGNUM *a; { BIGNUM *r; + bn_check_top(a); + r=BN_new(); if (r == NULL) return(NULL); return((BIGNUM *)BN_copy(r,a)); @@ -317,6 +458,8 @@ BIGNUM *b; int i; BN_ULONG *A,*B; + bn_check_top(b); + if (a == b) return(a); if (bn_wexpand(a,b->top) == NULL) return(NULL); @@ -352,6 +495,18 @@ BIGNUM *b; A[1]=B[1]; case 1: A[0]=B[0]; + case 0: + /* I need the 'case 0' entry for utrix cc. + * If the optimiser is turned on, it does the + * switch table by doing + * a=top&7 + * a--; + * goto jump_table[a]; + * If top is 0, this makes us jump to 0xffffffc which is + * rather bad :-(. 
+ * eric 23-Apr-1998 + */ + ; } #else memcpy(a->d,b->d,sizeof(b->d[0])*b->top); @@ -359,7 +514,7 @@ BIGNUM *b; /* memset(&(a->d[b->top]),0,sizeof(a->d[0])*(a->max-b->top));*/ a->top=b->top; - if (a->top == 0) + if ((a->top == 0) && (a->d != NULL)) a->d[0]=0; a->neg=b->neg; return(a); @@ -368,24 +523,21 @@ BIGNUM *b; void BN_clear(a) BIGNUM *a; { - memset(a->d,0,a->max*sizeof(a->d[0])); + if (a->d != NULL) + memset(a->d,0,a->max*sizeof(a->d[0])); a->top=0; a->neg=0; } -unsigned long BN_get_word(a) +BN_ULONG BN_get_word(a) BIGNUM *a; { int i,n; - unsigned long ret=0; + BN_ULONG ret=0; n=BN_num_bytes(a); - if (n > sizeof(unsigned long)) -#ifdef SIXTY_FOUR_BIT_LONG + if (n > sizeof(BN_ULONG)) return(BN_MASK2); -#else - return(0xFFFFFFFFL); -#endif for (i=a->top-1; i>=0; i--) { #ifndef SIXTY_FOUR_BIT /* the data item > unsigned long */ @@ -399,12 +551,12 @@ BIGNUM *a; int BN_set_word(a,w) BIGNUM *a; -unsigned long w; +BN_ULONG w; { int i,n; - if (bn_expand(a,sizeof(unsigned long)*8) == NULL) return(0); + if (bn_expand(a,sizeof(BN_ULONG)*8) == NULL) return(0); - n=sizeof(unsigned long)/BN_BYTES; + n=sizeof(BN_ULONG)/BN_BYTES; a->neg=0; a->top=0; a->d[0]=(BN_ULONG)w&BN_MASK2; @@ -488,6 +640,9 @@ BIGNUM *b; int i; BN_ULONG t1,t2,*ap,*bp; + bn_check_top(a); + bn_check_top(b); + i=a->top-b->top; if (i != 0) return(i); ap=a->d; @@ -519,6 +674,10 @@ BIGNUM *b; else return(0); } + + bn_check_top(a); + bn_check_top(b); + if (a->neg != b->neg) { if (a->neg) @@ -545,13 +704,15 @@ int BN_set_bit(a, n) BIGNUM *a; int n; { - int i,j; + int i,j,k; i=n/BN_BITS2; j=n%BN_BITS2; if (a->top <= i) { - if (bn_expand(a,n) == NULL) return(0); + if (bn_wexpand(a,i+1) == NULL) return(0); + for(k=a->top; k<i+1; k++) + a->d[k]=0; a->top=i+1; } @@ -570,6 +731,7 @@ int n; if (a->top <= i) return(0); a->d[i]&=(~(1L<<j)); + bn_fix_top(a); return(1); } @@ -601,11 +763,27 @@ int n; { a->top=w+1; a->d[w]&= ~(BN_MASK2<<b); - while ((w >= 0) && (a->d[w] == 0)) - { - a->top--; - w--; - } } + bn_fix_top(a); 
return(1); } + +int bn_cmp_words(a,b,n) +BN_ULONG *a,*b; +int n; + { + int i; + BN_ULONG aa,bb; + + aa=a[n-1]; + bb=b[n-1]; + if (aa != bb) return((aa > bb)?1:-1); + for (i=n-2; i>=0; i--) + { + aa=a[i]; + bb=b[i]; + if (aa != bb) return((aa > bb)?1:-1); + } + return(0); + } + diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c index e435df61f8..e0aa3c769d 100644 --- a/crypto/bn/bn_mont.c +++ b/crypto/bn/bn_mont.c @@ -60,161 +60,208 @@ #include "cryptlib.h" #include "bn_lcl.h" +#define MONT_WORD + int BN_mod_mul_montgomery(r,a,b,mont,ctx) BIGNUM *r,*a,*b; BN_MONT_CTX *mont; BN_CTX *ctx; { - BIGNUM *tmp; + BIGNUM *tmp,*tmp2; + + tmp= &(ctx->bn[ctx->tos]); + tmp2= &(ctx->bn[ctx->tos]); + ctx->tos+=2; - tmp=ctx->bn[ctx->tos++]; + bn_check_top(tmp); + bn_check_top(tmp2); if (a == b) { +#if 0 + bn_wexpand(tmp,a->top*2); + bn_wexpand(tmp2,a->top*4); + bn_sqr_recursive(tmp->d,a->d,a->top,tmp2->d); + tmp->top=a->top*2; + if (tmp->d[tmp->top-1] == 0) + tmp->top--; +#else if (!BN_sqr(tmp,a,ctx)) goto err; +#endif } else { - if (!BN_mul(tmp,a,b)) goto err; + if (!BN_mul(tmp,a,b,ctx)) goto err; } /* reduce from aRR to aR */ if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err; - ctx->tos--; + ctx->tos-=2; return(1); err: return(0); } -#define MONT_WORD - -#ifdef MONT_WORD int BN_from_montgomery(ret,a,mont,ctx) BIGNUM *ret; BIGNUM *a; BN_MONT_CTX *mont; BN_CTX *ctx; { - BIGNUM *n,*t1,*r; - BN_ULONG *ap,*np,*rp,n0,v; - int al,nl,max,i,x,ri; - int retn=0; +#ifdef BN_RECURSION + if (mont->use_word) +#endif + { + BIGNUM *n,*r; + BN_ULONG *ap,*np,*rp,n0,v,*nrp; + int al,nl,max,i,x,ri; + int retn=0; - t1=ctx->bn[ctx->tos]; - r=ctx->bn[ctx->tos+1]; + r= &(ctx->bn[ctx->tos]); - if (!BN_copy(r,a)) goto err; - n=mont->N; + if (!BN_copy(r,a)) goto err1; + n= &(mont->N); - ap=a->d; - /* mont->ri is the size of mont->N in bits/words */ - al=ri=mont->ri/BN_BITS2; + ap=a->d; + /* mont->ri is the size of mont->N in bits/words */ + al=ri=mont->ri/BN_BITS2; - nl=n->top; - if ((al == 0) || (nl == 
0)) { r->top=0; return(1); } + nl=n->top; + if ((al == 0) || (nl == 0)) { r->top=0; return(1); } - max=(nl+al+1); /* allow for overflow (no?) XXX */ - if (bn_wexpand(r,max) == NULL) goto err; - if (bn_wexpand(ret,max) == NULL) goto err; + max=(nl+al+1); /* allow for overflow (no?) XXX */ + if (bn_wexpand(r,max) == NULL) goto err1; + if (bn_wexpand(ret,max) == NULL) goto err1; - r->neg=a->neg^n->neg; - np=n->d; - rp=r->d; + r->neg=a->neg^n->neg; + np=n->d; + rp=r->d; + nrp= &(r->d[nl]); - /* clear the top words of T */ + /* clear the top words of T */ #if 1 - for (i=r->top; i<max; i++) /* memset? XXX */ - r->d[i]=0; + for (i=r->top; i<max; i++) /* memset? XXX */ + r->d[i]=0; #else - memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); + memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); #endif - r->top=max; - n0=mont->n0; + r->top=max; + n0=mont->n0; - for (i=0; i<nl; i++) - { -#if 0 - int x1,x2; - - if (i+4 > nl) +#ifdef BN_COUNT +printf("word BN_from_montgomery %d * %d\n",nl,nl); +#endif + for (i=0; i<nl; i++) { - x2=nl; - x1=0; + v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2); + nrp++; + rp++; + if (((nrp[-1]+=v)&BN_MASK2) >= v) + continue; + else + { + if (((++nrp[0])&BN_MASK2) != 0) continue; + if (((++nrp[1])&BN_MASK2) != 0) continue; + for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ; + } } + bn_fix_top(r); + + /* mont->ri will be a multiple of the word size */ +#if 0 + BN_rshift(ret,r,mont->ri); +#else + x=ri; + rp=ret->d; + ap= &(r->d[x]); + if (r->top < x) + al=0; else + al=r->top-x; + ret->top=al; + al-=4; + for (i=0; i<al; i+=4) { - x2=i+4; - x1=nl-x2; + BN_ULONG t1,t2,t3,t4; + + t1=ap[i+0]; + t2=ap[i+1]; + t3=ap[i+2]; + t4=ap[i+3]; + rp[i+0]=t1; + rp[i+1]=t2; + rp[i+2]=t3; + rp[i+3]=t4; } - v=bn_mul_add_words(&(rp[x1]),&(np[x1]),x2,(rp[x1]*n0)&BN_MASK2); -#else - v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2); + al+=4; + for (; i<al; i++) + rp[i]=ap[i]; #endif - if (((rp[nl]+=v)&BN_MASK2) < v) + if (BN_ucmp(ret, &(mont->N)) >= 0) { - for 
(x=(nl+1); (((++rp[x])&BN_MASK2) == 0); x++) - ; + BN_usub(ret,ret,&(mont->N)); /* XXX */ } - rp++; + retn=1; +err1: + return(retn); } - while (r->d[r->top-1] == 0) - r->top--; - - /* mont->ri will be a multiple of the word size */ -#if 0 - BN_rshift(ret,r,mont->ri); -#else - ap=r->d; - rp=ret->d; - x=ri; - al=r->top-x; - for (i=0; i<al; i++) +#ifdef BN_RECURSION + else /* bignum version */ { - rp[i]=ap[i+x]; - } - ret->top=al; + BIGNUM *t1,*t2,*t3; + int j,i; + +#ifdef BN_COUNT +printf("number BN_from_montgomery\n"); #endif - if (BN_ucmp(ret,mont->N) >= 0) - { - bn_qsub(ret,ret,mont->N); /* XXX */ - } - retn=1; -err: - return(retn); - } -#else -int BN_from_montgomery(r,a,mont,ctx) -BIGNUM *r; -BIGNUM *a; -BN_MONT_CTX *mont; -BN_CTX *ctx; - { - BIGNUM *t1,*t2; + t1= &(ctx->bn[ctx->tos]); + t2= &(ctx->bn[ctx->tos+1]); + t3= &(ctx->bn[ctx->tos+2]); - t1=ctx->bn[ctx->tos]; - t2=ctx->bn[ctx->tos+1]; + i=mont->Ni.top; + bn_wexpand(ret,i); /* perhaps only i*2 */ + bn_wexpand(t1,i*4); /* perhaps only i*2 */ + bn_wexpand(t2,i*2); /* perhaps only i */ - if (!BN_copy(t1,a)) goto err; - /* can cheat */ - BN_mask_bits(t1,mont->ri); + bn_mul_low_recursive(t2->d,a->d,mont->Ni.d,i,t1->d); - if (!BN_mul(t2,t1,mont->Ni)) goto err; - BN_mask_bits(t2,mont->ri); + BN_zero(t3); + BN_set_bit(t3,mont->N.top*BN_BITS2); + bn_sub_words(t3->d,t3->d,a->d,i); + bn_mul_high(ret->d,t2->d,mont->N.d,t3->d,i,t1->d); - if (!BN_mul(t1,t2,mont->N)) goto err; - if (!BN_add(t2,a,t1)) goto err; - BN_rshift(r,t2,mont->ri); + /* hmm... if a is between i and 2*i, things are bad */ + if (a->top > i) + { + j=bn_add_words(ret->d,ret->d,&(a->d[i]),i); + if (j) /* overflow */ + bn_sub_words(ret->d,ret->d,mont->N.d,i); + } + ret->top=i; + bn_fix_top(ret); + if (a->d[0]) + BN_add_word(ret,1); /* Always? */ + else /* Very very rare */ + { + for (i=1; i<mont->N.top-1; i++) + { + if (a->d[i]) + { + BN_add_word(ret,1); /* Always? 
*/ + break; + } + } + } - if (BN_ucmp(r,mont->N) >= 0) - bn_qsub(r,r,mont->N); + if (BN_ucmp(ret,&(mont->N)) >= 0) + BN_usub(ret,ret,&(mont->N)); - return(1); -err: - return(0); - } + return(1); + } #endif + } BN_MONT_CTX *BN_MONT_CTX_new() { @@ -222,25 +269,31 @@ BN_MONT_CTX *BN_MONT_CTX_new() if ((ret=(BN_MONT_CTX *)Malloc(sizeof(BN_MONT_CTX))) == NULL) return(NULL); - ret->ri=0; - ret->RR=BN_new(); - ret->N=BN_new(); - ret->Ni=NULL; - if ((ret->RR == NULL) || (ret->N == NULL)) - { - BN_MONT_CTX_free(ret); - return(NULL); - } + + BN_MONT_CTX_init(ret); + ret->flags=BN_FLG_MALLOCED; return(ret); } +void BN_MONT_CTX_init(ctx) +BN_MONT_CTX *ctx; + { + ctx->use_word=0; + ctx->ri=0; + BN_init(&(ctx->RR)); + BN_init(&(ctx->N)); + BN_init(&(ctx->Ni)); + ctx->flags=0; + } + void BN_MONT_CTX_free(mont) BN_MONT_CTX *mont; { - if (mont->RR != NULL) BN_free(mont->RR); - if (mont->N != NULL) BN_free(mont->N); - if (mont->Ni != NULL) BN_free(mont->Ni); - Free(mont); + BN_free(&(mont->RR)); + BN_free(&(mont->N)); + BN_free(&(mont->Ni)); + if (mont->flags & BN_FLG_MALLOCED) + Free(mont); } int BN_MONT_CTX_set(mont,mod,ctx) @@ -248,59 +301,109 @@ BN_MONT_CTX *mont; BIGNUM *mod; BN_CTX *ctx; { - BIGNUM *Ri=NULL,*R=NULL; - - if (mont->RR == NULL) mont->RR=BN_new(); - if (mont->N == NULL) mont->N=BN_new(); - - R=mont->RR; /* grab RR as a temp */ - BN_copy(mont->N,mod); /* Set N */ - -#ifdef MONT_WORD -{ - BIGNUM tmod; - BN_ULONG buf[2]; - /* int z; */ - - mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2; - BN_lshift(R,BN_value_one(),BN_BITS2); /* R */ - /* I was bad, this modification of a passed variable was - * breaking the multithreaded stuff :-( - * z=mod->top; - * mod->top=1; */ - - buf[0]=mod->d[0]; - buf[1]=0; - tmod.d=buf; - tmod.top=1; - tmod.max=mod->max; - tmod.neg=mod->neg; - - if ((Ri=BN_mod_inverse(R,&tmod,ctx)) == NULL) goto err; /* Ri */ - BN_lshift(Ri,Ri,BN_BITS2); /* R*Ri */ - bn_qsub(Ri,Ri,BN_value_one()); /* R*Ri - 1 */ - BN_div(Ri,NULL,Ri,&tmod,ctx); - 
mont->n0=Ri->d[0]; - BN_free(Ri); - /* mod->top=z; */ -} + BIGNUM Ri,*R; + + BN_init(&Ri); + R= &(mont->RR); /* grab RR as a temp */ + BN_copy(&(mont->N),mod); /* Set N */ + +#ifdef BN_RECURSION + if (mont->N.top < BN_MONT_CTX_SET_SIZE_WORD) +#endif + { + BIGNUM tmod; + BN_ULONG buf[2]; + + mont->use_word=1; + + mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2; + BN_zero(R); + BN_set_bit(R,BN_BITS2); + /* I was bad, this modification of a passed variable was + * breaking the multithreaded stuff :-( + * z=mod->top; + * mod->top=1; */ + + buf[0]=mod->d[0]; + buf[1]=0; + tmod.d=buf; + tmod.top=1; + tmod.max=mod->max; + tmod.neg=mod->neg; + + if ((BN_mod_inverse(&Ri,R,&tmod,ctx)) == NULL) + goto err; + BN_lshift(&Ri,&Ri,BN_BITS2); /* R*Ri */ + if (!BN_is_zero(&Ri)) + { +#if 1 + BN_sub_word(&Ri,1); +#else + BN_usub(&Ri,&Ri,BN_value_one()); /* R*Ri - 1 */ +#endif + } + else + { + /* This is not common..., 1 in BN_MASK2, + * It happens when buf[0] was == 1. So for 8 bit, + * this is 1/256, 16bit, 1 in 2^16 etc. 
+ */ + BN_set_word(&Ri,BN_MASK2); + } + BN_div(&Ri,NULL,&Ri,&tmod,ctx); + mont->n0=Ri.d[0]; + BN_free(&Ri); + /* mod->top=z; */ + } +#ifdef BN_RECURSION + else + { + mont->use_word=0; + mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2; +#if 1 + BN_zero(R); + BN_set_bit(R,mont->ri); #else - mont->ri=BN_num_bits(mod); - BN_lshift(R,BN_value_one(),mont->ri); /* R */ - if ((Ri=BN_mod_inverse(R,mod,ctx)) == NULL) goto err; /* Ri */ - BN_lshift(Ri,Ri,mont->ri); /* R*Ri */ - bn_qsub(Ri,Ri,BN_value_one()); /* R*Ri - 1 */ - BN_div(Ri,NULL,Ri,mod,ctx); - if (mont->Ni != NULL) BN_free(mont->Ni); - mont->Ni=Ri; /* Ni=(R*Ri-1)/N */ + BN_lshift(R,BN_value_one(),mont->ri); /* R */ +#endif + if ((BN_mod_inverse(&Ri,R,mod,ctx)) == NULL) + goto err; + BN_lshift(&Ri,&Ri,mont->ri); /* R*Ri */ +#if 1 + BN_sub_word(&Ri,1); +#else + BN_usub(&Ri,&Ri,BN_value_one()); /* R*Ri - 1 */ +#endif + BN_div(&(mont->Ni),NULL,&Ri,mod,ctx); + BN_free(&Ri); + } #endif /* setup RR for conversions */ +#if 1 + BN_zero(&(mont->RR)); + BN_set_bit(&(mont->RR),mont->ri*2); +#else BN_lshift(mont->RR,BN_value_one(),mont->ri*2); - BN_mod(mont->RR,mont->RR,mont->N,ctx); +#endif + BN_mod(&(mont->RR),&(mont->RR),&(mont->N),ctx); return(1); err: return(0); } +BN_MONT_CTX *BN_MONT_CTX_copy(to, from) +BN_MONT_CTX *to, *from; + { + if (to == from) return(to); + + BN_copy(&(to->RR),&(from->RR)); + BN_copy(&(to->N),&(from->N)); + BN_copy(&(to->Ni),&(from->Ni)); + to->use_word=from->use_word; + to->ri=from->ri; + to->n0=from->n0; + return(to); + } + diff --git a/crypto/bn/bn_mpi.c b/crypto/bn/bn_mpi.c index 53945c1057..84b0317081 100644 --- a/crypto/bn/bn_mpi.c +++ b/crypto/bn/bn_mpi.c @@ -103,7 +103,7 @@ BIGNUM *a; BNerr(BN_F_BN_MPI2BN,BN_R_INVALID_LENGTH); return(NULL); } - len=(d[0]<<24)|(d[1]<<16)|(d[2]<<8)|d[3]; + len=((long)d[0]<<24)|((long)d[1]<<16)|((int)d[2]<<8)|(int)d[3]; if ((len+4) != n) { BNerr(BN_F_BN_MPI2BN,BN_R_ENCODING_ERROR); diff --git a/crypto/bn/bn_mul.c b/crypto/bn/bn_mul.c index 
d0c04e1d4b..fc7bf974fd 100644 --- a/crypto/bn/bn_mul.c +++ b/crypto/bn/bn_mul.c @@ -60,150 +60,703 @@ #include "cryptlib.h" #include "bn_lcl.h" -/* r must be different to a and b */ -/* int BN_mmul(r, a, b) */ -int BN_mul(r, a, b) -BIGNUM *r; -BIGNUM *a; -BIGNUM *b; +#ifdef BN_RECURSION +/* r is 2*n2 words in size, + * a and b are both n2 words in size. + * n2 must be a power of 2. + * We multiply and return the result. + * t must be 2*n2 words in size + * We calulate + * a[0]*b[0] + * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0]) + * a[1]*b[1] + */ +void bn_mul_recursive(r,a,b,n2,t) +BN_ULONG *r,*a,*b; +int n2; +BN_ULONG *t; { - int i; - int max,al,bl; - BN_ULONG *ap,*bp,*rp; + int n=n2/2,c1,c2; + unsigned int neg,zero; + BN_ULONG ln,lo,*p; - al=a->top; - bl=b->top; - if ((al == 0) || (bl == 0)) +#ifdef BN_COUNT +printf(" bn_mul_recursive %d * %d\n",n2,n2); +#endif +#ifdef BN_MUL_COMBA +/* if (n2 == 4) { - r->top=0; - return(1); + bn_mul_comba4(r,a,b); + return; + } + else */ if (n2 == 8) + { + bn_mul_comba8(r,a,b); + return; + } +#endif + if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) + { + /* This should not happen */ + bn_mul_normal(r,a,n2,b,n2); + return; + } + /* r=(a[0]-a[1])*(b[1]-b[0]) */ + c1=bn_cmp_words(a,&(a[n]),n); + c2=bn_cmp_words(&(b[n]),b,n); + zero=neg=0; + switch (c1*3+c2) + { + case -4: + bn_sub_words(t, &(a[n]),a, n); /* - */ + bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ + break; + case -3: + zero=1; + break; + case -2: + bn_sub_words(t, &(a[n]),a, n); /* - */ + bn_sub_words(&(t[n]),&(b[n]),b, n); /* + */ + neg=1; + break; + case -1: + case 0: + case 1: + zero=1; + break; + case 2: + bn_sub_words(t, a, &(a[n]),n); /* + */ + bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ + neg=1; + break; + case 3: + zero=1; + break; + case 4: + bn_sub_words(t, a, &(a[n]),n); + bn_sub_words(&(t[n]),&(b[n]),b, n); + break; } - max=(al+bl); - if (bn_wexpand(r,max) == NULL) return(0); - r->top=max; - r->neg=a->neg^b->neg; - ap=a->d; - bp=b->d; - rp=r->d; +#ifdef BN_MUL_COMBA + 
if (n == 4) + { + if (!zero) + bn_mul_comba4(&(t[n2]),t,&(t[n])); + else + memset(&(t[n2]),0,8*sizeof(BN_ULONG)); + + bn_mul_comba4(r,a,b); + bn_mul_comba4(&(r[n2]),&(a[n]),&(b[n])); + } + else if (n == 8) + { + if (!zero) + bn_mul_comba8(&(t[n2]),t,&(t[n])); + else + memset(&(t[n2]),0,16*sizeof(BN_ULONG)); + + bn_mul_comba8(r,a,b); + bn_mul_comba8(&(r[n2]),&(a[n]),&(b[n])); + } + else +#endif + { + p= &(t[n2*2]); + if (!zero) + bn_mul_recursive(&(t[n2]),t,&(t[n]),n,p); + else + memset(&(t[n2]),0,n2*sizeof(BN_ULONG)); + bn_mul_recursive(r,a,b,n,p); + bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),n,p); + } - rp[al]=bn_mul_words(rp,ap,al,*(bp++)); - rp++; - for (i=1; i<bl; i++) + /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign + * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + */ + + c1=bn_add_words(t,r,&(r[n2]),n2); + + if (neg) /* if t[32] is negative */ { - rp[al]=bn_mul_add_words(rp,ap,al,*(bp++)); - rp++; + c1-=bn_sub_words(&(t[n2]),t,&(t[n2]),n2); + } + else + { + /* Might have a carry */ + c1+=bn_add_words(&(t[n2]),&(t[n2]),t,n2); } - if (r->d[max-1] == 0) r->top--; - return(1); - } -#if 0 -#include "stack.h" + /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1]) + * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + * c1 holds the carry bits + */ + c1+=bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2); + if (c1) + { + p= &(r[n+n2]); + lo= *p; + ln=(lo+c1)&BN_MASK2; + *p=ln; -int limit=16; + /* The overflow will stop before we over write + * words we should not overwrite */ + if (ln < (BN_ULONG)c1) + { + do { + p++; + lo= *p; + ln=(lo+1)&BN_MASK2; + *p=ln; + } while (ln == 0); + } + } + } -typedef struct bn_pool_st +/* n+tn is the word length + * t needs to be n*4 is size, as does r */ +void bn_mul_part_recursive(r,a,b,tn,n,t) +BN_ULONG *r,*a,*b; +int tn,n; +BN_ULONG *t; { - int used; - int tos; - STACK *sk; - } BN_POOL; + int i,j,n2=n*2; + unsigned int c1; + BN_ULONG ln,lo,*p; -BIGNUM *BN_POOL_push(bp) -BN_POOL *bp; - { - BIGNUM *ret; +#ifdef 
BN_COUNT +printf(" bn_mul_part_recursive %d * %d\n",tn+n,tn+n); +#endif + if (n < 8) + { + i=tn+n; + bn_mul_normal(r,a,i,b,i); + return; + } + + /* r=(a[0]-a[1])*(b[1]-b[0]) */ + bn_sub_words(t, a, &(a[n]),n); /* + */ + bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ - if (bp->used >= bp->tos) +/* if (n == 4) + { + bn_mul_comba4(&(t[n2]),t,&(t[n])); + bn_mul_comba4(r,a,b); + bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn); + memset(&(r[n2+tn*2]),0,sizeof(BN_ULONG)*(n2-tn*2)); + } + else */ if (n == 8) { - ret=BN_new(); - sk_push(bp->sk,(char *)ret); - bp->tos++; - bp->used++; + bn_mul_comba8(&(t[n2]),t,&(t[n])); + bn_mul_comba8(r,a,b); + bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn); + memset(&(r[n2+tn*2]),0,sizeof(BN_ULONG)*(n2-tn*2)); } else { - ret=(BIGNUM *)sk_value(bp->sk,bp->used); - bp->used++; + p= &(t[n2*2]); + bn_mul_recursive(&(t[n2]),t,&(t[n]),n,p); + bn_mul_recursive(r,a,b,n,p); + i=n/2; + /* If there is only a bottom half to the number, + * just do it */ + j=tn-i; + if (j == 0) + { + bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),i,p); + memset(&(r[n2+i*2]),0,sizeof(BN_ULONG)*(n2-i*2)); + } + else if (j > 0) /* eg, n == 16, i == 8 and tn == 11 */ + { + bn_mul_part_recursive(&(r[n2]),&(a[n]),&(b[n]), + j,i,p); + memset(&(r[n2+tn*2]),0, + sizeof(BN_ULONG)*(n2-tn*2)); + } + else /* (j < 0) eg, n == 16, i == 8 and tn == 5 */ + { + memset(&(r[n2]),0,sizeof(BN_ULONG)*n2); + if (tn < BN_MUL_RECURSIVE_SIZE_NORMAL) + { + bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn); + } + else + { + for (;;) + { + i/=2; + if (i < tn) + { + bn_mul_part_recursive(&(r[n2]), + &(a[n]),&(b[n]), + tn-i,i,p); + break; + } + else if (i == tn) + { + bn_mul_recursive(&(r[n2]), + &(a[n]),&(b[n]), + i,p); + break; + } + } + } + } + } + + /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign + * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + */ + + c1=bn_add_words(t,r,&(r[n2]),n2); + c1-=bn_sub_words(&(t[n2]),t,&(t[n2]),n2); + + /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1]) 
+ * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + * c1 holds the carry bits + */ + c1+=bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2); + if (c1) + { + p= &(r[n+n2]); + lo= *p; + ln=(lo+c1)&BN_MASK2; + *p=ln; + + /* The overflow will stop before we over write + * words we should not overwrite */ + if (ln < c1) + { + do { + p++; + lo= *p; + ln=(lo+1)&BN_MASK2; + *p=ln; + } while (ln == 0); + } } - return(ret); } -void BN_POOL_pop(bp,num) -BN_POOL *bp; -int num; +/* a and b must be the same size, which is n2. + * r needs to be n2 words and t needs to be n2*2 + */ +void bn_mul_low_recursive(r,a,b,n2,t) +BN_ULONG *r,*a,*b; +int n2; +BN_ULONG *t; { - bp->used-=num; + int n=n2/2; + +#ifdef BN_COUNT +printf(" bn_mul_low_recursive %d * %d\n",n2,n2); +#endif + + bn_mul_recursive(r,a,b,n,&(t[0])); + if (n >= BN_MUL_LOW_RECURSIVE_SIZE_NORMAL) + { + bn_mul_low_recursive(&(t[0]),&(a[0]),&(b[n]),n,&(t[n2])); + bn_add_words(&(r[n]),&(r[n]),&(t[0]),n); + bn_mul_low_recursive(&(t[0]),&(a[n]),&(b[0]),n,&(t[n2])); + bn_add_words(&(r[n]),&(r[n]),&(t[0]),n); + } + else + { + bn_mul_low_normal(&(t[0]),&(a[0]),&(b[n]),n); + bn_mul_low_normal(&(t[n]),&(a[n]),&(b[0]),n); + bn_add_words(&(r[n]),&(r[n]),&(t[0]),n); + bn_add_words(&(r[n]),&(r[n]),&(t[n]),n); + } } -int BN_mul(r,a,b) -BIGNUM *r,*a,*b; +/* a and b must be the same size, which is n2. + * r needs to be n2 words and t needs to be n2*2 + * l is the low words of the output. 
+ * t needs to be n2*3 + */ +void bn_mul_high(r,a,b,l,n2,t) +BN_ULONG *r,*a,*b,*l; +int n2; +BN_ULONG *t; { - static BN_POOL bp; - static init=1; + int i,n; + int c1,c2; + int neg,oneg,zero; + BN_ULONG ll,lc,*lp,*mp; + +#ifdef BN_COUNT +printf(" bn_mul_high %d * %d\n",n2,n2); +#endif + n=(n2+1)/2; + + /* Calculate (al-ah)*(bh-bl) */ + neg=zero=0; + c1=bn_cmp_words(&(a[0]),&(a[n]),n); + c2=bn_cmp_words(&(b[n]),&(b[0]),n); + switch (c1*3+c2) + { + case -4: + bn_sub_words(&(r[0]),&(a[n]),&(a[0]),n); + bn_sub_words(&(r[n]),&(b[0]),&(b[n]),n); + break; + case -3: + zero=1; + break; + case -2: + bn_sub_words(&(r[0]),&(a[n]),&(a[0]),n); + bn_sub_words(&(r[n]),&(b[n]),&(b[0]),n); + neg=1; + break; + case -1: + case 0: + case 1: + zero=1; + break; + case 2: + bn_sub_words(&(r[0]),&(a[0]),&(a[n]),n); + bn_sub_words(&(r[n]),&(b[0]),&(b[n]),n); + neg=1; + break; + case 3: + zero=1; + break; + case 4: + bn_sub_words(&(r[0]),&(a[0]),&(a[n]),n); + bn_sub_words(&(r[n]),&(b[n]),&(b[0]),n); + break; + } + + oneg=neg; + /* t[10] = (a[0]-a[1])*(b[1]-b[0]) */ + /* r[10] = (a[1]*b[1]) */ +#ifdef BN_MUL_COMBA + if (n == 8) + { + bn_mul_comba8(&(t[0]),&(r[0]),&(r[n])); + bn_mul_comba8(r,&(a[n]),&(b[n])); + } + else +#endif + { + bn_mul_recursive(&(t[0]),&(r[0]),&(r[n]),n,&(t[n2])); + bn_mul_recursive(r,&(a[n]),&(b[n]),n,&(t[n2])); + } - if (init) + /* s0 == low(al*bl) + * s1 == low(ah*bh)+low((al-ah)*(bh-bl))+low(al*bl)+high(al*bl) + * We know s0 and s1 so the only unknown is high(al*bl) + * high(al*bl) == s1 - low(ah*bh+s0+(al-ah)*(bh-bl)) + * high(al*bl) == s1 - (r[0]+l[0]+t[0]) + */ + if (l != NULL) { - bp.used=0; - bp.tos=0; - bp.sk=sk_new_null(); - init=0; + lp= &(t[n2+n]); + c1=bn_add_words(lp,&(r[0]),&(l[0]),n); + } + else + { + c1=0; + lp= &(r[0]); + } + + if (neg) + neg=bn_sub_words(&(t[n2]),lp,&(t[0]),n); + else + { + bn_add_words(&(t[n2]),lp,&(t[0]),n); + neg=0; + } + + if (l != NULL) + { + bn_sub_words(&(t[n2+n]),&(l[n]),&(t[n2]),n); + } + else + { + lp= &(t[n2+n]); + mp= 
&(t[n2]); + for (i=0; i<n; i++) + lp[i]=((~mp[i])+1)&BN_MASK2; + } + + /* s[0] = low(al*bl) + * t[3] = high(al*bl) + * t[10] = (a[0]-a[1])*(b[1]-b[0]) neg is the sign + * r[10] = (a[1]*b[1]) + */ + /* R[10] = al*bl + * R[21] = al*bl + ah*bh + (a[0]-a[1])*(b[1]-b[0]) + * R[32] = ah*bh + */ + /* R[1]=t[3]+l[0]+r[0](+-)t[0] (have carry/borrow) + * R[2]=r[0]+t[3]+r[1](+-)t[1] (have carry/borrow) + * R[3]=r[1]+(carry/borrow) + */ + if (l != NULL) + { + lp= &(t[n2]); + c1= bn_add_words(lp,&(t[n2+n]),&(l[0]),n); + } + else + { + lp= &(t[n2+n]); + c1=0; + } + c1+=bn_add_words(&(t[n2]),lp, &(r[0]),n); + if (oneg) + c1-=bn_sub_words(&(t[n2]),&(t[n2]),&(t[0]),n); + else + c1+=bn_add_words(&(t[n2]),&(t[n2]),&(t[0]),n); + + c2 =bn_add_words(&(r[0]),&(r[0]),&(t[n2+n]),n); + c2+=bn_add_words(&(r[0]),&(r[0]),&(r[n]),n); + if (oneg) + c2-=bn_sub_words(&(r[0]),&(r[0]),&(t[n]),n); + else + c2+=bn_add_words(&(r[0]),&(r[0]),&(t[n]),n); + + if (c1 != 0) /* Add starting at r[0], could be +ve or -ve */ + { + i=0; + if (c1 > 0) + { + lc=c1; + do { + ll=(r[i]+lc)&BN_MASK2; + r[i++]=ll; + lc=(lc > ll); + } while (lc); + } + else + { + lc= -c1; + do { + ll=r[i]; + r[i++]=(ll-lc)&BN_MASK2; + lc=(lc > ll); + } while (lc); + } + } + if (c2 != 0) /* Add starting at r[1] */ + { + i=n; + if (c2 > 0) + { + lc=c2; + do { + ll=(r[i]+lc)&BN_MASK2; + r[i++]=ll; + lc=(lc > ll); + } while (lc); + } + else + { + lc= -c2; + do { + ll=r[i]; + r[i++]=(ll-lc)&BN_MASK2; + lc=(lc > ll); + } while (lc); + } } - return(BN_mm(r,a,b,&bp)); } +#endif -/* r must be different to a and b */ -int BN_mm(m, A, B, bp) -BIGNUM *m,*A,*B; -BN_POOL *bp; +int BN_mul(r,a,b,ctx) +BIGNUM *r,*a,*b; +BN_CTX *ctx; { - int i,num; - int an,bn; - BIGNUM *a,*b,*c,*d,*ac,*bd; + int top,i,j,k,al,bl; + BIGNUM *t; + + t=NULL; + i=j=k=0; + +#ifdef BN_COUNT +printf("BN_mul %d * %d\n",a->top,b->top); +#endif + + bn_check_top(a); + bn_check_top(b); + bn_check_top(r); - an=A->top; - bn=B->top; - if ((an <= limit) || (bn <= limit)) + al=a->top; + 
bl=b->top; + r->neg=a->neg^b->neg; + + if ((al == 0) || (bl == 0)) { - return(BN_mmul(m,A,B)); + BN_zero(r); + return(1); } + top=al+bl; +#if defined(BN_MUL_COMBA) || defined(BN_RECURSION) + if (al == bl) + { +# ifdef BN_MUL_COMBA +/* if (al == 4) + { + if (bn_wexpand(r,8) == NULL) return(0); + r->top=8; + bn_mul_comba4(r->d,a->d,b->d); + goto end; + } + else */ if (al == 8) + { + if (bn_wexpand(r,16) == NULL) return(0); + r->top=16; + bn_mul_comba8(r->d,a->d,b->d); + goto end; + } + else +# endif +#ifdef BN_RECURSION + if (al < BN_MULL_SIZE_NORMAL) +#endif + { + if (bn_wexpand(r,top) == NULL) return(0); + r->top=top; + bn_mul_normal(r->d,a->d,al,b->d,bl); + goto end; + } +# ifdef BN_RECURSION + goto symetric; +# endif + } +#endif +#ifdef BN_RECURSION + else if ((al < BN_MULL_SIZE_NORMAL) || (bl < BN_MULL_SIZE_NORMAL)) + { + if (bn_wexpand(r,top) == NULL) return(0); + r->top=top; + bn_mul_normal(r->d,a->d,al,b->d,bl); + goto end; + } + else + { + i=(al-bl); + if ((i == 1) && !BN_get_flags(b,BN_FLG_STATIC_DATA)) + { + bn_wexpand(b,al); + b->d[bl]=0; + bl++; + goto symetric; + } + else if ((i == -1) && !BN_get_flags(a,BN_FLG_STATIC_DATA)) + { + bn_wexpand(a,bl); + a->d[al]=0; + al++; + goto symetric; + } + } +#endif - a=BN_POOL_push(bp); - b=BN_POOL_push(bp); - c=BN_POOL_push(bp); - d=BN_POOL_push(bp); - ac=BN_POOL_push(bp); - bd=BN_POOL_push(bp); + /* asymetric and >= 4 */ + if (bn_wexpand(r,top) == NULL) return(0); + r->top=top; + bn_mul_normal(r->d,a->d,al,b->d,bl); - num=(an <= bn)?an:bn; - num=1<<(BN_num_bits_word(num-1)-1); +#ifdef BN_RECURSION + if (0) + { +symetric: + /* symetric and > 4 */ + /* 16 or larger */ + j=BN_num_bits_word((BN_ULONG)al); + j=1<<(j-1); + k=j+j; + t= &(ctx->bn[ctx->tos]); + if (al == j) /* exact multiple */ + { + bn_wexpand(t,k*2); + bn_wexpand(r,k*2); + bn_mul_recursive(r->d,a->d,b->d,al,t->d); + } + else + { + bn_wexpand(a,k); + bn_wexpand(b,k); + bn_wexpand(t,k*4); + bn_wexpand(r,k*4); + for (i=a->top; i<k; i++) + a->d[i]=0; + for 
(i=b->top; i<k; i++) + b->d[i]=0; + bn_mul_part_recursive(r->d,a->d,b->d,al-j,j,t->d); + } + r->top=top; + } +#endif +end: + bn_fix_top(r); + return(1); + } - /* Are going to now chop things into 'num' word chunks. */ - num*=BN_BITS2; +void bn_mul_normal(r,a,na,b,nb) +BN_ULONG *r,*a; +int na; +BN_ULONG *b; +int nb; + { + BN_ULONG *rr; - BN_copy(a,A); - BN_mask_bits(a,num); - BN_rshift(b,A,num); +#ifdef BN_COUNT +printf(" bn_mul_normal %d * %d\n",na,nb); +#endif - BN_copy(c,B); - BN_mask_bits(c,num); - BN_rshift(d,B,num); + if (na < nb) + { + int itmp; + BN_ULONG *ltmp; - BN_sub(ac ,b,a); - BN_sub(bd,c,d); - BN_mm(m,ac,bd,bp); - BN_mm(ac,a,c,bp); - BN_mm(bd,b,d,bp); + itmp=na; na=nb; nb=itmp; + ltmp=a; a=b; b=ltmp; - BN_add(m,m,ac); - BN_add(m,m,bd); - BN_lshift(m,m,num); - BN_lshift(bd,bd,num*2); + } + rr= &(r[na]); + rr[0]=bn_mul_words(r,a,na,b[0]); - BN_add(m,m,ac); - BN_add(m,m,bd); - BN_POOL_pop(bp,6); - return(1); + for (;;) + { + if (--nb <= 0) return; + rr[1]=bn_mul_add_words(&(r[1]),a,na,b[1]); + if (--nb <= 0) return; + rr[2]=bn_mul_add_words(&(r[2]),a,na,b[2]); + if (--nb <= 0) return; + rr[3]=bn_mul_add_words(&(r[3]),a,na,b[3]); + if (--nb <= 0) return; + rr[4]=bn_mul_add_words(&(r[4]),a,na,b[4]); + rr+=4; + r+=4; + b+=4; + } } + +void bn_mul_low_normal(r,a,b,n) +BN_ULONG *r,*a,*b; +int n; + { +#ifdef BN_COUNT +printf(" bn_mul_low_normal %d * %d\n",n,n); #endif + bn_mul_words(r,a,n,b[0]); + + for (;;) + { + if (--n <= 0) return; + bn_mul_add_words(&(r[1]),a,n,b[1]); + if (--n <= 0) return; + bn_mul_add_words(&(r[2]),a,n,b[2]); + if (--n <= 0) return; + bn_mul_add_words(&(r[3]),a,n,b[3]); + if (--n <= 0) return; + bn_mul_add_words(&(r[4]),a,n,b[4]); + r+=4; + b+=4; + } + } + diff --git a/crypto/bn/bn_opts.c b/crypto/bn/bn_opts.c new file mode 100644 index 0000000000..86a03e2423 --- /dev/null +++ b/crypto/bn/bn_opts.c @@ -0,0 +1,342 @@ +/* crypto/bn/expspeed.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. 
+ * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. 
If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
+ */ + +/* most of this code has been pilfered from my libdes speed.c program */ + +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include "crypto.h" +#include "tmdiff.h" +#include "bn.h" +#include "err.h" + +#define DEFAULT_SIZE 512 +#define DEFAULT_TIME 3 + +int verbose=1; + +typedef struct parms_st + { + char *name; + void (*func)(); + BIGNUM r; + BIGNUM a; + BIGNUM b; + BIGNUM c; + BIGNUM low; + BN_CTX *ctx; + BN_MONT_CTX *mont; + int w; + } PARMS; + +void do_mul_exp(int num,PARMS *p); +void do_mul(int num,PARMS *p); +void do_sqr(int num,PARMS *p); +void do_mul_low(int num,PARMS *p); +void do_mul_high(int num,PARMS *p); +void do_from_montgomery(int num,PARMS *p); +int time_it(int sec, PARMS *p); +void do_it(int sec, PARMS *p); + +#define P_EXP 1 +#define P_MUL 2 +#define P_SQR 3 +#define P_MULL 4 +#define P_MULH 5 +#define P_MRED 6 + +int main(argc,argv) +int argc; +char **argv; + { + PARMS p; + BN_MONT_CTX *mont; + int size=0,num; + char *name; + int type=P_EXP; + + mont=BN_MONT_CTX_new(); + p.mont=NULL; + p.ctx=BN_CTX_new(); + BN_init(&p.r); + BN_init(&p.a); + BN_init(&p.b); + BN_init(&p.c); + BN_init(&p.low); + p.w=0; + + for (;;) + { + if (argc > 1) + { + if (argv[1][0] == '-') + { + switch(argv[1][1]) + { + case 'e': type=P_EXP; break; + case 'm': type=P_MUL; break; + case 's': type=P_SQR; break; + case 'l': type=P_MULL; break; + case 'h': type=P_MULH; break; + case 'r': type=P_MRED; break; + default: + fprintf(stderr,"options: -[emslhr]\n"); + exit(1); + } + } + else + { + size=atoi(argv[1]); + } + argc--; + argv++; + } + else + break; + } + if (size == 0) + size=DEFAULT_SIZE; + + printf("bit size:%5d\n",size); + + BN_rand(&p.a,size,1,0); + BN_rand(&p.b,size,1,0); + BN_rand(&p.c,size,1,1); + BN_mod(&p.a,&p.a,&p.c,p.ctx); + BN_mod(&p.b,&p.b,&p.c,p.ctx); + p.w=(p.a.top+1)/2; + + BN_mul(&p.low,&p.a,&p.b,p.ctx); + p.low.top=p.a.top; + + switch(type) + { + case P_EXP: + p.name="r=a^b%c"; + p.func=do_mul_exp; + p.mont=mont; 
+ break; + case P_MUL: + p.name="r=a*b"; + p.func=do_mul; + break; + case P_SQR: + p.name="r=a*a"; + p.func=do_sqr; + break; + case P_MULL: + p.name="r=low(a*b)"; + p.func=do_mul_low; + break; + case P_MULH: + p.name="r=high(a*b)"; + p.func=do_mul_high; + break; + case P_MRED: + p.name="r=montgomery_reduction(a)"; + p.func=do_from_montgomery; + p.mont=mont; + break; + default: + fprintf(stderr,"options: -[emslhr]\n"); + exit(1); + } + + num=time_it(DEFAULT_TIME,&p); + do_it(num,&p); + } + +void do_it(num,p) +int num; +PARMS *p; + { + char *start,*end; + int i,j,number; + double d; + + start=ms_time_new(); + end=ms_time_new(); + + number=BN_num_bits_word((BN_ULONG)BN_num_bits(&(p->c)))- + BN_num_bits_word(BN_BITS2)+2; + for (i=number-1; i >=0; i--) + { + if (i == 1) continue; + BN_set_params(i,i,i,1); + if (p->mont != NULL) + BN_MONT_CTX_set(p->mont,&(p->c),p->ctx); + + printf("Timing %5d (%2d bit) %2d %2d %2d %2d :", + (1<<i)*BN_BITS2,i, + BN_get_params(0), + BN_get_params(1), + BN_get_params(2), + BN_get_params(3)); + fflush(stdout); + + ms_time_get(start); + p->func(num,p); + ms_time_get(end); + d=ms_time_diff(start,end); + printf("%6.6f sec, or %d in %.4f seconds\n", + (double)d/num,num,d); + } + } + +int time_it(sec,p) +int sec; +PARMS *p; + { + char *start,*end; + int i,j; + double d; + + if (p->mont != NULL) + BN_MONT_CTX_set(p->mont,&(p->c),p->ctx); + + start=ms_time_new(); + end=ms_time_new(); + + i=1; + for (;;) + { + if (verbose) + printf("timing %s for %d interations\n",p->name,i); + + ms_time_get(start); + p->func(i,p); + ms_time_get(end); + d=ms_time_diff(start,end); + + if (d < 0.01) i*=100; + else if (d < 0.1 ) i*=10; + else if (d > (double)sec) break; + else + { + i=(int)(1.0*i*sec/d); + break; + } + } + if (verbose) + printf("using %d interations\n",i); + return(i); + } + +void do_mul_exp(num,p) +int num; +PARMS *p; + { + int i; + + for (i=0; i<num; i++) + BN_mod_exp_mont(&(p->r),&(p->a),&(p->b),&(p->c), + p->ctx,p->mont); + } + +void do_mul(num,p) 
+int num; +PARMS *p; + { + int i; + + for (i=0; i<num; i++) + BN_mul(&(p->r),&(p->a),&(p->b),p->ctx); + } + +void do_sqr(num,p) +int num; +PARMS *p; + { + int i; + + for (i=0; i<num; i++) + BN_sqr(&(p->r),&(p->a),p->ctx); + } + +void do_mul_low(num,p) +int num; +PARMS *p; + { + int i; + + for (i=0; i<num; i++) + BN_mul_low(&(p->r),&(p->a),&(p->b),p->w,p->ctx); + } + +void do_mul_high(num,p) +int num; +PARMS *p; + { + int i; + + for (i=0; i<num; i++) + BN_mul_low(&(p->r),&(p->a),&(p->b),&(p->low),p->w,p->ctx); + } + +void do_from_montgomery(num,p) +int num; +PARMS *p; + { + int i; + + for (i=0; i<num; i++) + BN_from_montgomery(&(p->r),&(p->a),p->mont,p->ctx); + } + diff --git a/crypto/bn/bn_prime.c b/crypto/bn/bn_prime.c index 0c85f70b59..c4fb58ef9a 100644 --- a/crypto/bn/bn_prime.c +++ b/crypto/bn/bn_prime.c @@ -83,7 +83,8 @@ static int probable_prime_dh(); static int probable_prime_dh_strong(); #endif -BIGNUM *BN_generate_prime(bits,strong,add,rem,callback,cb_arg) +BIGNUM *BN_generate_prime(ret,bits,strong,add,rem,callback,cb_arg) +BIGNUM *ret; int bits; int strong; BIGNUM *add; @@ -92,16 +93,19 @@ void (*callback)(P_I_I_P); char *cb_arg; { BIGNUM *rnd=NULL; - BIGNUM *ret=NULL; - BIGNUM *t=NULL; + BIGNUM t; int i,j,c1=0; BN_CTX *ctx; ctx=BN_CTX_new(); if (ctx == NULL) goto err; - if ((rnd=BN_new()) == NULL) goto err; - if (strong) - if ((t=BN_new()) == NULL) goto err; + if (ret == NULL) + { + if ((rnd=BN_new()) == NULL) goto err; + } + else + rnd=ret; + BN_init(&t); loop: /* make a random number and set the top and bottom bits */ if (add == NULL) @@ -136,7 +140,7 @@ loop: * check that (p-1)/2 is prime. 
* Since a prime is odd, We just * need to divide by 2 */ - if (!BN_rshift1(t,rnd)) goto err; + if (!BN_rshift1(&t,rnd)) goto err; for (i=0; i<BN_prime_checks; i++) { @@ -144,7 +148,7 @@ loop: if (j == -1) goto err; if (j == 0) goto loop; - j=BN_is_prime(t,1,callback,ctx,cb_arg); + j=BN_is_prime(&t,1,callback,ctx,cb_arg); if (j == -1) goto err; if (j == 0) goto loop; @@ -156,7 +160,7 @@ loop: ret=rnd; err: if ((ret == NULL) && (rnd != NULL)) BN_free(rnd); - if (t != NULL) BN_free(t); + BN_free(&t); if (ctx != NULL) BN_CTX_free(ctx); return(ret); } @@ -183,7 +187,7 @@ char *cb_arg; if ((ctx2=BN_CTX_new()) == NULL) goto err; if ((mont=BN_MONT_CTX_new()) == NULL) goto err; - check=ctx->bn[ctx->tos++]; + check= &(ctx->bn[ctx->tos++]); /* Setup the montgomery structure */ if (!BN_MONT_CTX_set(mont,a,ctx2)) goto err; @@ -224,14 +228,14 @@ BN_MONT_CTX *mont; BIGNUM *d,*dd,*tmp,*d1,*d2,*n1; BIGNUM *mont_one,*mont_n1,*mont_a; - d1=ctx->bn[ctx->tos]; - d2=ctx->bn[ctx->tos+1]; - n1=ctx->bn[ctx->tos+2]; + d1= &(ctx->bn[ctx->tos]); + d2= &(ctx->bn[ctx->tos+1]); + n1= &(ctx->bn[ctx->tos+2]); ctx->tos+=3; - mont_one=ctx2->bn[ctx2->tos]; - mont_n1=ctx2->bn[ctx2->tos+1]; - mont_a=ctx2->bn[ctx2->tos+2]; + mont_one= &(ctx2->bn[ctx2->tos]); + mont_n1= &(ctx2->bn[ctx2->tos+1]); + mont_a= &(ctx2->bn[ctx2->tos+2]); ctx2->tos+=3; d=d1; @@ -287,8 +291,9 @@ int bits; { int i; MS_STATIC BN_ULONG mods[NUMPRIMES]; - BN_ULONG delta; + BN_ULONG delta,d; +again: if (!BN_rand(rnd,bits,1,1)) return(0); /* we now have a random number 'rand' to test. 
*/ for (i=1; i<NUMPRIMES; i++) @@ -300,9 +305,12 @@ int bits; * that gcd(rnd-1,primes) == 1 (except for 2) */ if (((mods[i]+delta)%primes[i]) <= 1) { + d=delta; delta+=2; /* perhaps need to check for overflow of - * delta (but delta can be upto 2^32) */ + * delta (but delta can be upto 2^32) + * 21-May-98 eay - added overflow check */ + if (delta < d) goto again; goto loop; } } @@ -320,7 +328,7 @@ BN_CTX *ctx; int i,ret=0; BIGNUM *t1; - t1=ctx->bn[ctx->tos++]; + t1= &(ctx->bn[ctx->tos++]); if (!BN_rand(rnd,bits,0,1)) goto err; @@ -361,9 +369,9 @@ BN_CTX *ctx; BIGNUM *t1,*qadd=NULL,*q=NULL; bits--; - t1=ctx->bn[ctx->tos++]; - q=ctx->bn[ctx->tos++]; - qadd=ctx->bn[ctx->tos++]; + t1= &(ctx->bn[ctx->tos++]); + q= &(ctx->bn[ctx->tos++]); + qadd= &(ctx->bn[ctx->tos++]); if (!BN_rshift1(qadd,padd)) goto err; @@ -413,11 +421,11 @@ BN_CTX *ctx; BIGNUM *d,*dd,*tmp; BIGNUM *d1,*d2,*x,*n1,*inv; - d1=ctx->bn[ctx->tos]; - d2=ctx->bn[ctx->tos+1]; - x=ctx->bn[ctx->tos+2]; - n1=ctx->bn[ctx->tos+3]; - inv=ctx->bn[ctx->tos+4]; + d1= &(ctx->bn[ctx->tos]); + d2= &(ctx->bn[ctx->tos+1]); + x= &(ctx->bn[ctx->tos+2]); + n1= &(ctx->bn[ctx->tos+3]); + inv=&(ctx->bn[ctx->tos+4]); ctx->tos+=5; d=d1; diff --git a/crypto/bn/bn_recp.c b/crypto/bn/bn_recp.c index 72cd69d3fc..97ca857ed1 100644 --- a/crypto/bn/bn_recp.c +++ b/crypto/bn/bn_recp.c @@ -60,66 +60,182 @@ #include "cryptlib.h" #include "bn_lcl.h" -int BN_mod_mul_reciprocal(r, x, y, m, i, nb, ctx) +void BN_RECP_CTX_init(recp) +BN_RECP_CTX *recp; + { + BN_init(&(recp->N)); + BN_init(&(recp->Nr)); + recp->num_bits=0; + recp->flags=0; + } + +BN_RECP_CTX *BN_RECP_CTX_new() + { + BN_RECP_CTX *ret; + + if ((ret=(BN_RECP_CTX *)Malloc(sizeof(BN_RECP_CTX))) == NULL) + return(NULL); + + BN_RECP_CTX_init(ret); + ret->flags=BN_FLG_MALLOCED; + return(ret); + } + +void BN_RECP_CTX_free(recp) +BN_RECP_CTX *recp; + { + BN_free(&(recp->N)); + BN_free(&(recp->Nr)); + if (recp->flags & BN_FLG_MALLOCED) + Free(recp); + } + +int BN_RECP_CTX_set(recp,d,ctx) 
+BN_RECP_CTX *recp; +BIGNUM *d; +BN_CTX *ctx; + { + BN_copy(&(recp->N),d); + BN_zero(&(recp->Nr)); + recp->num_bits=BN_num_bits(d); + recp->shift=0; + return(1); + } + +int BN_mod_mul_reciprocal(r, x, y, recp, ctx) BIGNUM *r; BIGNUM *x; BIGNUM *y; +BN_RECP_CTX *recp; +BN_CTX *ctx; + { + int ret=0; + BIGNUM *a; + + a= &(ctx->bn[ctx->tos++]); + if (y != NULL) + { + if (x == y) + { if (!BN_sqr(a,x,ctx)) goto err; } + else + { if (!BN_mul(a,x,y,ctx)) goto err; } + } + else + a=x; /* Just do the mod */ + + BN_div_recp(NULL,r,a,recp,ctx); + ret=1; +err: + ctx->tos--; + return(ret); + } + +int BN_div_recp(dv,rem,m,recp,ctx) +BIGNUM *dv; +BIGNUM *rem; BIGNUM *m; -BIGNUM *i; -int nb; +BN_RECP_CTX *recp; BN_CTX *ctx; { - int ret=0,j; - BIGNUM *a,*b,*c,*d; + int i,j,tos,ret=0,ex; + BIGNUM *a,*b,*d,*r; + + tos=ctx->tos; + a= &(ctx->bn[ctx->tos++]); + b= &(ctx->bn[ctx->tos++]); + if (dv != NULL) + d=dv; + else + d= &(ctx->bn[ctx->tos++]); + if (rem != NULL) + r=rem; + else + r= &(ctx->bn[ctx->tos++]); + + if (BN_ucmp(m,&(recp->N)) < 0) + { + BN_zero(d); + BN_copy(r,m); + ctx->tos=tos; + return(1); + } - a=ctx->bn[ctx->tos++]; - b=ctx->bn[ctx->tos++]; - c=ctx->bn[ctx->tos++]; - d=ctx->bn[ctx->tos++]; + /* We want the remainder + * Given input of ABCDEF / ab + * we need multiply ABCDEF by 3 digests of the reciprocal of ab + * + */ + i=BN_num_bits(m); - if (x == y) - { if (!BN_sqr(a,x,ctx)) goto err; } + j=recp->num_bits*2; + if (j > i) + { + i=j; + ex=0; + } else - { if (!BN_mul(a,x,y)) goto err; } - if (!BN_rshift(d,a,nb)) goto err; - if (!BN_mul(b,d,i)) goto err; - if (!BN_rshift(c,b,nb)) goto err; - if (!BN_mul(b,m,c)) goto err; - if (!BN_sub(r,a,b)) goto err; + { + ex=(i-j)/2; + } + + j=i/2; + + if (i != recp->shift) + recp->shift=BN_reciprocal(&(recp->Nr),&(recp->N), + i,ctx); + + if (!BN_rshift(a,m,j-ex)) goto err; + if (!BN_mul(b,a,&(recp->Nr),ctx)) goto err; + if (!BN_rshift(d,b,j+ex)) goto err; + d->neg=0; + if (!BN_mul(b,&(recp->N),d,ctx)) goto err; + if 
(!BN_usub(r,m,b)) goto err; + r->neg=0; + j=0; - while (BN_cmp(r,m) >= 0) +#if 1 + while (BN_ucmp(r,&(recp->N)) >= 0) { if (j++ > 2) { BNerr(BN_F_BN_MOD_MUL_RECIPROCAL,BN_R_BAD_RECIPROCAL); goto err; } - if (!BN_sub(r,r,m)) goto err; + if (!BN_usub(r,r,&(recp->N))) goto err; + if (!BN_add_word(d,1)) goto err; } +#endif + r->neg=BN_is_zero(r)?0:m->neg; + d->neg=m->neg^recp->N.neg; ret=1; err: - ctx->tos-=4; + ctx->tos=tos; return(ret); - } + } -int BN_reciprocal(r, m,ctx) +/* len is the expected size of the result + * We actually calculate with an extra word of precision, so + * we can do faster division if the remainder is not required. + */ +int BN_reciprocal(r,m,len,ctx) BIGNUM *r; BIGNUM *m; +int len; BN_CTX *ctx; { - int nm,ret= -1; - BIGNUM *t; + int ret= -1; + BIGNUM t; - t=ctx->bn[ctx->tos++]; + BN_init(&t); - nm=BN_num_bits(m); - if (!BN_lshift(t,BN_value_one(),nm*2)) goto err; + BN_zero(&t); + if (!BN_set_bit(&t,len)) goto err; - if (!BN_div(r,NULL,t,m,ctx)) goto err; - ret=nm; + if (!BN_div(r,NULL,&t,m,ctx)) goto err; + ret=len; err: - ctx->tos--; + BN_free(&t); return(ret); } diff --git a/crypto/bn/bn_sqr.c b/crypto/bn/bn_sqr.c index a8464610e5..19ec0ddf84 100644 --- a/crypto/bn/bn_sqr.c +++ b/crypto/bn/bn_sqr.c @@ -67,30 +67,84 @@ BIGNUM *r; BIGNUM *a; BN_CTX *ctx; { - int i,j,max,al; + int max,al; BIGNUM *tmp; - BN_ULONG *ap,*rp; - tmp=ctx->bn[ctx->tos]; +#ifdef BN_COUNT +printf("BN_sqr %d * %d\n",a->top,a->top); +#endif + bn_check_top(a); + tmp= &(ctx->bn[ctx->tos]); al=a->top; - if (al == 0) + if (al <= 0) { r->top=0; return(1); } - max=(al*2); - if (bn_wexpand(r,1+max) == NULL) return(0); - if (bn_wexpand(tmp,1+max) == NULL) return(0); + max=(al+al); + if (bn_wexpand(r,max+1) == NULL) return(0); r->neg=0; + if (al == 4) + { +#ifndef BN_SQR_COMBA + BN_ULONG t[8]; + bn_sqr_normal(r->d,a->d,4,t); +#else + bn_sqr_comba4(r->d,a->d); +#endif + } + else if (al == 8) + { +#ifndef BN_SQR_COMBA + BN_ULONG t[16]; + bn_sqr_normal(r->d,a->d,8,t); +#else + 
bn_sqr_comba8(r->d,a->d); +#endif + } + else + { +#if defined(BN_RECURSION) + if (al < BN_SQR_RECURSIVE_SIZE_NORMAL) + { + BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL*2]; + bn_sqr_normal(r->d,a->d,al,t); + } + else + { + if (bn_wexpand(tmp,2*max+1) == NULL) return(0); + bn_sqr_recursive(r->d,a->d,al,tmp->d); + } +#else + if (bn_wexpand(tmp,max) == NULL) return(0); + bn_sqr_normal(r->d,a->d,al,tmp->d); +#endif + } + + r->top=max; + if ((max > 0) && (r->d[max-1] == 0)) r->top--; + return(1); + } + +/* tmp must have 2*n words */ +void bn_sqr_normal(r, a, n, tmp) +BN_ULONG *r; +BN_ULONG *a; +int n; +BN_ULONG *tmp; + { + int i,j,max; + BN_ULONG *ap,*rp; - ap=a->d; - rp=r->d; + max=n*2; + ap=a; + rp=r; rp[0]=rp[max-1]=0; rp++; - j=al; + j=n; if (--j > 0) { @@ -99,7 +153,7 @@ BN_CTX *ctx; rp+=2; } - for (i=2; i<al; i++) + for (i=n-2; i>0; i--) { j--; ap++; @@ -107,16 +161,115 @@ BN_CTX *ctx; rp+=2; } - bn_add_words(r->d,r->d,r->d,max); + bn_add_words(r,r,r,max); /* There will not be a carry */ - bn_sqr_words(tmp->d,a->d,al); + bn_sqr_words(tmp,a,n); - bn_add_words(r->d,r->d,tmp->d,max); - - r->top=max; - if (r->d[max-1] == 0) r->top--; - return(1); + bn_add_words(r,r,tmp,max); } +#ifdef BN_RECURSION +/* r is 2*n words in size, + * a and b are both n words in size. + * n must be a power of 2. + * We multiply and return the result. 
+ * t must be 2*n words in size + * We calulate + * a[0]*b[0] + * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0]) + * a[1]*b[1] + */ +void bn_sqr_recursive(r,a,n2,t) +BN_ULONG *r,*a; +int n2; +BN_ULONG *t; + { + int n=n2/2; + int zero,c1; + BN_ULONG ln,lo,*p; + +#ifdef BN_COUNT +printf(" bn_sqr_recursive %d * %d\n",n2,n2); +#endif + if (n2 == 4) + { +#ifndef BN_SQR_COMBA + bn_sqr_normal(r,a,4,t); +#else + bn_sqr_comba4(r,a); +#endif + return; + } + else if (n2 == 8) + { +#ifndef BN_SQR_COMBA + bn_sqr_normal(r,a,8,t); +#else + bn_sqr_comba8(r,a); +#endif + return; + } + if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) + { + bn_sqr_normal(r,a,n2,t); + return; + } + /* r=(a[0]-a[1])*(a[1]-a[0]) */ + c1=bn_cmp_words(a,&(a[n]),n); + zero=0; + if (c1 > 0) + bn_sub_words(t,a,&(a[n]),n); + else if (c1 < 0) + bn_sub_words(t,&(a[n]),a,n); + else + zero=1; + + /* The result will always be negative unless it is zero */ + p= &(t[n2*2]); + + if (!zero) + bn_sqr_recursive(&(t[n2]),t,n,p); + else + memset(&(t[n2]),0,n*sizeof(BN_ULONG)); + bn_sqr_recursive(r,a,n,p); + bn_sqr_recursive(&(r[n2]),&(a[n]),n,p); + + /* t[32] holds (a[0]-a[1])*(a[1]-a[0]), it is negative or zero + * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + */ + + c1=bn_add_words(t,r,&(r[n2]),n2); + + /* t[32] is negative */ + c1-=bn_sub_words(&(t[n2]),t,&(t[n2]),n2); + + /* t[32] holds (a[0]-a[1])*(a[1]-a[0])+(a[0]*a[0])+(a[1]*a[1]) + * r[10] holds (a[0]*a[0]) + * r[32] holds (a[1]*a[1]) + * c1 holds the carry bits + */ + c1+=bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2); + if (c1) + { + p= &(r[n+n2]); + lo= *p; + ln=(lo+c1)&BN_MASK2; + *p=ln; + + /* The overflow will stop before we over write + * words we should not overwrite */ + if (ln < (BN_ULONG)c1) + { + do { + p++; + lo= *p; + ln=(lo+1)&BN_MASK2; + *p=ln; + } while (ln == 0); + } + } + } +#endif diff --git a/crypto/bn/bn_word.c b/crypto/bn/bn_word.c index 4b3d0f011d..9c168e4f48 100644 --- a/crypto/bn/bn_word.c +++ b/crypto/bn/bn_word.c @@ -62,7 +62,7 @@ BN_ULONG 
BN_mod_word(a, w) BIGNUM *a; -unsigned long w; +BN_ULONG w; { #ifndef BN_LLONG BN_ULONG ret=0; @@ -75,8 +75,8 @@ unsigned long w; for (i=a->top-1; i>=0; i--) { #ifndef BN_LLONG - ret=((ret<<BN_BITS4)|((a->d[i]>>BN_BITS4)&BN_MASK2l))%(unsigned long)w; - ret=((ret<<BN_BITS4)|(a->d[i]&BN_MASK2l))%(unsigned long)w; + ret=((ret<<BN_BITS4)|((a->d[i]>>BN_BITS4)&BN_MASK2l))%w; + ret=((ret<<BN_BITS4)|(a->d[i]&BN_MASK2l))%w; #else ret=(BN_ULLONG)(((ret<<(BN_ULLONG)BN_BITS2)|a->d[i])% (BN_ULLONG)w); @@ -87,7 +87,7 @@ unsigned long w; BN_ULONG BN_div_word(a, w) BIGNUM *a; -unsigned long w; +BN_ULONG w; { BN_ULONG ret; int i; @@ -100,18 +100,18 @@ unsigned long w; BN_ULONG l,d; l=a->d[i]; - d=bn_div64(ret,l,w); + d=bn_div_words(ret,l,w); ret=(l-((d*w)&BN_MASK2))&BN_MASK2; a->d[i]=d; } - if (a->d[a->top-1] == 0) + if ((a->top > 0) && (a->d[a->top-1] == 0)) a->top--; return(ret); } int BN_add_word(a, w) BIGNUM *a; -unsigned long w; +BN_ULONG w; { BN_ULONG l; int i; @@ -144,7 +144,7 @@ unsigned long w; int BN_sub_word(a, w) BIGNUM *a; -unsigned long w; +BN_ULONG w; { int i; @@ -185,7 +185,7 @@ unsigned long w; int BN_mul_word(a,w) BIGNUM *a; -unsigned long w; +BN_ULONG w; { BN_ULONG ll; @@ -199,6 +199,6 @@ unsigned long w; a->d[a->top++]=ll; } } - return(0); + return(1); } diff --git a/crypto/bn/bnspeed.c b/crypto/bn/bnspeed.c index f7c2790fff..777212c1ba 100644 --- a/crypto/bn/bnspeed.c +++ b/crypto/bn/bnspeed.c @@ -94,7 +94,8 @@ struct tms { #include <sys/timeb.h> #endif -#ifdef sun +#if defined(sun) || defined(__ultrix) +#define _POSIX_SOURCE #include <limits.h> #include <sys/param.h> #endif @@ -180,15 +181,14 @@ int argc; char **argv; { BN_CTX *ctx; - BIGNUM *a,*b,*c,*r; + BIGNUM a,b,c; ctx=BN_CTX_new(); - a=BN_new(); - b=BN_new(); - c=BN_new(); - r=BN_new(); + BN_init(&a); + BN_init(&b); + BN_init(&c); - do_mul(a,b,c,ctx); + do_mul(&a,&b,&c,ctx); } void do_mul(r,a,b,ctx) @@ -211,7 +211,7 @@ BN_CTX *ctx; BN_rand(b,sizes[j],1,0); Time_F(START); for (k=0; k<num; k++) - 
BN_mul(r,b,a); + BN_mul(r,b,a,ctx); tm=Time_F(STOP); printf("mul %4d x %4d -> %8.3fms\n",sizes[i],sizes[j],tm*1000.0/num); } diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c index 9ebd68b429..ec48bad738 100644 --- a/crypto/bn/bntest.c +++ b/crypto/bn/bntest.c @@ -71,19 +71,20 @@ #endif #ifndef NOPROTO -int test_add (BIO *bp); -int test_sub (BIO *bp); -int test_lshift1 (BIO *bp); -int test_lshift (BIO *bp); -int test_rshift1 (BIO *bp); -int test_rshift (BIO *bp); -int test_div (BIO *bp,BN_CTX *ctx); -int test_mul (BIO *bp); -int test_sqr (BIO *bp,BN_CTX *ctx); -int test_mont (BIO *bp,BN_CTX *ctx); -int test_mod (BIO *bp,BN_CTX *ctx); -int test_mod_mul (BIO *bp,BN_CTX *ctx); -int test_mod_exp (BIO *bp,BN_CTX *ctx); +int test_add(BIO *bp); +int test_sub(BIO *bp); +int test_lshift1(BIO *bp); +int test_lshift(BIO *bp); +int test_rshift1(BIO *bp); +int test_rshift(BIO *bp); +int test_div(BIO *bp,BN_CTX *ctx); +int test_div_recp(BIO *bp,BN_CTX *ctx); +int test_mul(BIO *bp); +int test_sqr(BIO *bp,BN_CTX *ctx); +int test_mont(BIO *bp,BN_CTX *ctx); +int test_mod(BIO *bp,BN_CTX *ctx); +int test_mod_mul(BIO *bp,BN_CTX *ctx); +int test_mod_exp(BIO *bp,BN_CTX *ctx); int rand_neg(void); #else int test_add (); @@ -192,6 +193,10 @@ char *argv[]; if (!test_div(out,ctx)) goto err; fflush(stdout); + fprintf(stderr,"test BN_div_recp\n"); + if (!test_div_recp(out,ctx)) goto err; + fflush(stdout); + fprintf(stderr,"test BN_mod\n"); if (!test_mod(out,ctx)) goto err; fflush(stdout); @@ -221,80 +226,80 @@ err: int test_add(bp) BIO *bp; { - BIGNUM *a,*b,*c; + BIGNUM a,b,c; int i; int j; - a=BN_new(); - b=BN_new(); - c=BN_new(); + BN_init(&a); + BN_init(&b); + BN_init(&c); - BN_rand(a,512,0,0); + BN_rand(&a,512,0,0); for (i=0; i<100; i++) { - BN_rand(b,450+i,0,0); - a->neg=rand_neg(); - b->neg=rand_neg(); + BN_rand(&b,450+i,0,0); + a.neg=rand_neg(); + b.neg=rand_neg(); if (bp == NULL) for (j=0; j<10000; j++) - BN_add(c,a,b); - BN_add(c,a,b); + BN_add(&c,&a,&b); + BN_add(&c,&a,&b); if (bp 
!= NULL) { if (!results) { - BN_print(bp,a); + BN_print(bp,&a); BIO_puts(bp," + "); - BN_print(bp,b); + BN_print(bp,&b); BIO_puts(bp," - "); } - BN_print(bp,c); + BN_print(bp,&c); BIO_puts(bp,"\n"); } } - BN_free(a); - BN_free(b); - BN_free(c); + BN_free(&a); + BN_free(&b); + BN_free(&c); return(1); } int test_sub(bp) BIO *bp; { - BIGNUM *a,*b,*c; + BIGNUM a,b,c; int i; int j; - a=BN_new(); - b=BN_new(); - c=BN_new(); + BN_init(&a); + BN_init(&b); + BN_init(&c); - BN_rand(a,512,0,0); + BN_rand(&a,512,0,0); for (i=0; i<100; i++) { - BN_rand(b,400+i,0,0); - a->neg=rand_neg(); - b->neg=rand_neg(); + BN_rand(&b,400+i,0,0); + a.neg=rand_neg(); + b.neg=rand_neg(); if (bp == NULL) for (j=0; j<10000; j++) - BN_sub(c,a,b); - BN_sub(c,a,b); + BN_sub(&c,&a,&b); + BN_sub(&c,&a,&b); if (bp != NULL) { if (!results) { - BN_print(bp,a); + BN_print(bp,&a); BIO_puts(bp," - "); - BN_print(bp,b); + BN_print(bp,&b); BIO_puts(bp," - "); } - BN_print(bp,c); + BN_print(bp,&c); BIO_puts(bp,"\n"); } } - BN_free(a); - BN_free(b); - BN_free(c); + BN_free(&a); + BN_free(&b); + BN_free(&c); return(1); } @@ -302,92 +307,154 @@ int test_div(bp,ctx) BIO *bp; BN_CTX *ctx; { - BIGNUM *a,*b,*c,*d; + BIGNUM a,b,c,d; int i; int j; - a=BN_new(); - b=BN_new(); - c=BN_new(); - d=BN_new(); + BN_init(&a); + BN_init(&b); + BN_init(&c); + BN_init(&d); - BN_rand(a,400,0,0); + BN_rand(&a,400,0,0); for (i=0; i<100; i++) { - BN_rand(b,50+i,0,0); - a->neg=rand_neg(); - b->neg=rand_neg(); + BN_rand(&b,50+i,0,0); + a.neg=rand_neg(); + b.neg=rand_neg(); if (bp == NULL) for (j=0; j<100; j++) - BN_div(d,c,a,b,ctx); - BN_div(d,c,a,b,ctx); + BN_div(&d,&c,&a,&b,ctx); + BN_div(&d,&c,&a,&b,ctx); if (bp != NULL) { if (!results) { - BN_print(bp,a); + BN_print(bp,&a); BIO_puts(bp," / "); - BN_print(bp,b); + BN_print(bp,&b); BIO_puts(bp," - "); } - BN_print(bp,d); + BN_print(bp,&d); BIO_puts(bp,"\n"); if (!results) { - BN_print(bp,a); + BN_print(bp,&a); BIO_puts(bp," % "); - BN_print(bp,b); + BN_print(bp,&b); BIO_puts(bp," - 
"); } - BN_print(bp,c); + BN_print(bp,&c); BIO_puts(bp,"\n"); } } - BN_free(a); - BN_free(b); - BN_free(c); - BN_free(d); + BN_free(&a); + BN_free(&b); + BN_free(&c); + BN_free(&d); + return(1); + } + +int test_div_recp(bp,ctx) +BIO *bp; +BN_CTX *ctx; + { + BIGNUM a,b,c,d; + BN_RECP_CTX recp; + int i; + int j; + + BN_RECP_CTX_init(&recp); + BN_init(&a); + BN_init(&b); + BN_init(&c); + BN_init(&d); + + BN_rand(&a,400,0,0); + for (i=0; i<100; i++) + { + BN_rand(&b,50+i,0,0); + a.neg=rand_neg(); + b.neg=rand_neg(); + BN_RECP_CTX_set(&recp,&b,ctx); + if (bp == NULL) + for (j=0; j<100; j++) + BN_div_recp(&d,&c,&a,&recp,ctx); + BN_div_recp(&d,&c,&a,&recp,ctx); + if (bp != NULL) + { + if (!results) + { + BN_print(bp,&a); + BIO_puts(bp," / "); + BN_print(bp,&b); + BIO_puts(bp," - "); + } + BN_print(bp,&d); + BIO_puts(bp,"\n"); + + if (!results) + { + BN_print(bp,&a); + BIO_puts(bp," % "); + BN_print(bp,&b); + BIO_puts(bp," - "); + } + BN_print(bp,&c); + BIO_puts(bp,"\n"); + } + } + BN_free(&a); + BN_free(&b); + BN_free(&c); + BN_free(&d); + BN_RECP_CTX_free(&recp); return(1); } int test_mul(bp) BIO *bp; { - BIGNUM *a,*b,*c; + BIGNUM a,b,c; int i; int j; + BN_CTX ctx; - a=BN_new(); - b=BN_new(); - c=BN_new(); + BN_CTX_init(&ctx); + BN_init(&a); + BN_init(&b); + BN_init(&c); - BN_rand(a,200,0,0); + BN_rand(&a,200,0,0); for (i=0; i<100; i++) { - BN_rand(b,250+i,0,0); - a->neg=rand_neg(); - b->neg=rand_neg(); + BN_rand(&b,250+i,0,0); + BN_rand(&b,200,0,0); + a.neg=rand_neg(); + b.neg=rand_neg(); if (bp == NULL) for (j=0; j<100; j++) - BN_mul(c,a,b); - BN_mul(c,a,b); + BN_mul(&c,&a,&b,&ctx); + BN_mul(&c,&a,&b,&ctx); +/*bn_do(&c,&a,&b,ctx); */ if (bp != NULL) { if (!results) { - BN_print(bp,a); + BN_print(bp,&a); BIO_puts(bp," * "); - BN_print(bp,b); + BN_print(bp,&b); BIO_puts(bp," - "); } - BN_print(bp,c); + BN_print(bp,&c); BIO_puts(bp,"\n"); } } - BN_free(a); - BN_free(b); - BN_free(c); + BN_free(&a); + BN_free(&b); + BN_free(&c); + BN_CTX_free(&ctx); return(1); } @@ -395,36 
+462,36 @@ int test_sqr(bp,ctx) BIO *bp; BN_CTX *ctx; { - BIGNUM *a,*c; + BIGNUM a,c; int i; int j; - a=BN_new(); - c=BN_new(); + BN_init(&a); + BN_init(&c); for (i=0; i<40; i++) { - BN_rand(a,40+i*10,0,0); - a->neg=rand_neg(); + BN_rand(&a,40+i*10,0,0); + a.neg=rand_neg(); if (bp == NULL) for (j=0; j<100; j++) - BN_sqr(c,a,ctx); - BN_sqr(c,a,ctx); + BN_sqr(&c,&a,ctx); + BN_sqr(&c,&a,ctx); if (bp != NULL) { if (!results) { - BN_print(bp,a); + BN_print(bp,&a); BIO_puts(bp," * "); - BN_print(bp,a); + BN_print(bp,&a); BIO_puts(bp," - "); } - BN_print(bp,c); + BN_print(bp,&c); BIO_puts(bp,"\n"); } } - BN_free(a); - BN_free(c); + BN_free(&a); + BN_free(&c); return(1); } @@ -432,61 +499,61 @@ int test_mont(bp,ctx) BIO *bp; BN_CTX *ctx; { - BIGNUM *a,*b,*c,*A,*B; - BIGNUM *n; + BIGNUM a,b,c,A,B; + BIGNUM n; int i; int j; BN_MONT_CTX *mont; - a=BN_new(); - b=BN_new(); - c=BN_new(); - A=BN_new(); - B=BN_new(); - n=BN_new(); + BN_init(&a); + BN_init(&b); + BN_init(&c); + BN_init(&A); + BN_init(&B); + BN_init(&n); mont=BN_MONT_CTX_new(); - BN_rand(a,100,0,0); /**/ - BN_rand(b,100,0,0); /**/ + BN_rand(&a,100,0,0); /**/ + BN_rand(&b,100,0,0); /**/ for (i=0; i<10; i++) { - BN_rand(n,(100%BN_BITS2+1)*BN_BITS2*i*BN_BITS2,0,1); /**/ - BN_MONT_CTX_set(mont,n,ctx); + BN_rand(&n,(100%BN_BITS2+1)*BN_BITS2*i*BN_BITS2,0,1); /**/ + BN_MONT_CTX_set(mont,&n,ctx); - BN_to_montgomery(A,a,mont,ctx); - BN_to_montgomery(B,b,mont,ctx); + BN_to_montgomery(&A,&a,mont,ctx); + BN_to_montgomery(&B,&b,mont,ctx); if (bp == NULL) for (j=0; j<100; j++) - BN_mod_mul_montgomery(c,A,B,mont,ctx);/**/ - BN_mod_mul_montgomery(c,A,B,mont,ctx);/**/ - BN_from_montgomery(A,c,mont,ctx);/**/ + BN_mod_mul_montgomery(&c,&A,&B,mont,ctx);/**/ + BN_mod_mul_montgomery(&c,&A,&B,mont,ctx);/**/ + BN_from_montgomery(&A,&c,mont,ctx);/**/ if (bp != NULL) { if (!results) { #ifdef undef fprintf(stderr,"%d * %d %% %d\n", -BN_num_bits(a), -BN_num_bits(b), +BN_num_bits(&a), +BN_num_bits(&b), BN_num_bits(mont->N)); #endif - 
BN_print(bp,a); + BN_print(bp,&a); BIO_puts(bp," * "); - BN_print(bp,b); + BN_print(bp,&b); BIO_puts(bp," % "); - BN_print(bp,mont->N); + BN_print(bp,&(mont->N)); BIO_puts(bp," - "); } - BN_print(bp,A); + BN_print(bp,&A); BIO_puts(bp,"\n"); } } BN_MONT_CTX_free(mont); - BN_free(a); - BN_free(b); - BN_free(c); + BN_free(&a); + BN_free(&b); + BN_free(&c); return(1); } diff --git a/crypto/bn/comba.pl b/crypto/bn/comba.pl new file mode 100644 index 0000000000..211a8b45c7 --- /dev/null +++ b/crypto/bn/comba.pl @@ -0,0 +1,285 @@ +#!/usr/local/bin/perl + +$num=8; +$num2=8/2; + +print <<"EOF"; +/* crypto/bn/bn_comba.c */ +#include <stdio.h> +#include "bn_lcl.h" +/* Auto generated from crypto/bn/comba.pl + */ + +#undef bn_mul_comba8 +#undef bn_mul_comba4 +#undef bn_sqr_comba8 +#undef bn_sqr_comba4 + +#ifdef BN_LLONG +#define mul_add_c(a,b,c0,c1,c2) \\ + t=(BN_ULLONG)a*b; \\ + t1=(BN_ULONG)Lw(t); \\ + t2=(BN_ULONG)Hw(t); \\ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \\ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define mul_add_c2(a,b,c0,c1,c2) \\ + t=(BN_ULLONG)a*b; \\ + tt=(t+t)&BN_MASK; \\ + if (tt < t) c2++; \\ + t1=(BN_ULONG)Lw(tt); \\ + t2=(BN_ULONG)Hw(tt); \\ + c0=(c0+t1)&BN_MASK2; \\ + if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \\ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c(a,i,c0,c1,c2) \\ + t=(BN_ULLONG)a[i]*a[i]; \\ + t1=(BN_ULONG)Lw(t); \\ + t2=(BN_ULONG)Hw(t); \\ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \\ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c2(a,i,j,c0,c1,c2) \\ + mul_add_c2((a)[i],(a)[j],c0,c1,c2) +#else +#define mul_add_c(a,b,c0,c1,c2) \\ + t1=LBITS(a); t2=HBITS(a); \\ + bl=LBITS(b); bh=HBITS(b); \\ + mul64(t1,t2,bl,bh); \\ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \\ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define mul_add_c2(a,b,c0,c1,c2) \\ + t1=LBITS(a); t2=HBITS(a); \\ + bl=LBITS(b); bh=HBITS(b); \\ + mul64(t1,t2,bl,bh); \\ + if (t2 & BN_TBIT) c2++; \\ + t2=(t2+t2)&BN_MASK2; \\ + if (t1 
& BN_TBIT) t2++; \\ + t1=(t1+t1)&BN_MASK2; \\ + c0=(c0+t1)&BN_MASK2; \\ + if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \\ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c(a,i,c0,c1,c2) \\ + sqr64(t1,t2,(a)[i]); \\ + c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \\ + c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; + +#define sqr_add_c2(a,i,j,c0,c1,c2) \\ + mul_add_c2((a)[i],(a)[j],c0,c1,c2) +#endif + +void bn_mul_comba${num}(r,a,b) +BN_ULONG *r,*a,*b; + { +#ifdef BN_LLONG + BN_ULLONG t; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + +EOF +$ret=&combas_mul("r","a","b",$num,"c1","c2","c3"); +printf <<"EOF"; + } + +void bn_mul_comba${num2}(r,a,b) +BN_ULONG *r,*a,*b; + { +#ifdef BN_LLONG + BN_ULLONG t; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + +EOF +$ret=&combas_mul("r","a","b",$num2,"c1","c2","c3"); +printf <<"EOF"; + } + +void bn_sqr_comba${num}(r,a) +BN_ULONG *r,*a; + { +#ifdef BN_LLONG + BN_ULLONG t,tt; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + +EOF +$ret=&combas_sqr("r","a",$num,"c1","c2","c3"); +printf <<"EOF"; + } + +void bn_sqr_comba${num2}(r,a) +BN_ULONG *r,*a; + { +#ifdef BN_LLONG + BN_ULLONG t,tt; +#else + BN_ULONG bl,bh; +#endif + BN_ULONG t1,t2; + BN_ULONG c1,c2,c3; + +EOF +$ret=&combas_sqr("r","a",$num2,"c1","c2","c3"); +printf <<"EOF"; + } +EOF + +sub bn_str + { + local($var,$val)=@_; + print "\t$var=$val;\n"; + } + +sub bn_ary + { + local($var,$idx)=@_; + return("${var}[$idx]"); + } + +sub bn_clr + { + local($var)=@_; + + print "\t$var=0;\n"; + } + +sub bn_mad + { + local($a,$b,$c0,$c1,$c2,$num)=@_; + + if ($num == 2) + { printf("\tmul_add_c2($a,$b,$c0,$c1,$c2);\n"); } + else + { printf("\tmul_add_c($a,$b,$c0,$c1,$c2);\n"); } + } + +sub bn_sad + { + local($a,$i,$j,$c0,$c1,$c2,$num)=@_; + + if ($num == 2) + { printf("\tsqr_add_c2($a,$i,$j,$c0,$c1,$c2);\n"); } + else + { printf("\tsqr_add_c($a,$i,$c0,$c1,$c2);\n"); } + } + +sub combas_mul + { + 
local($r,$a,$b,$num,$c0,$c1,$c2)=@_; + local($i,$as,$ae,$bs,$be,$ai,$bi); + local($tot,$end); + + $as=0; + $ae=0; + $bs=0; + $be=0; + $tot=$num+$num-1; + &bn_clr($c0); + &bn_clr($c1); + for ($i=0; $i<$tot; $i++) + { + $ai=$as; + $bi=$bs; + $end=$be+1; + @numa=@numb=(); + +#print "($as $ae) ($bs $be) $bs -> $end [$i $num]\n"; + for ($j=$bs; $j<$end; $j++) + { + push(@numa,$ai); + push(@numb,$bi); + $ai--; + $bi++; + } + + if ($i & 1) + { + @numa=reverse(@numa); + @numb=reverse(@numb); + } + + &bn_clr($c2); + for ($j=0; $j<=$#numa; $j++) + { + &bn_mad(&bn_ary($a,$numa[$j]), + &bn_ary($b,$numb[$j]),$c0,$c1,$c2,1); + } + &bn_str(&bn_ary($r,$i),$c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + + $as++ if ($i < ($num-1)); + $ae++ if ($i >= ($num-1)); + + $bs++ if ($i >= ($num-1)); + $be++ if ($i < ($num-1)); + } + &bn_str(&bn_ary($r,$i),$c0); + } + +sub combas_sqr + { + local($r,$a,$num,$c0,$c1,$c2)=@_; + local($i,$as,$ae,$bs,$be,$ai,$bi); + local($b,$tot,$end,$half); + + $b=$a; + $as=0; + $ae=0; + $bs=0; + $be=0; + $tot=$num+$num-1; + &bn_clr($c0); + &bn_clr($c1); + for ($i=0; $i<$tot; $i++) + { + $ai=$as; + $bi=$bs; + $end=$be+1; + @numa=@numb=(); + +#print "($as $ae) ($bs $be) $bs -> $end [$i $num]\n"; + for ($j=$bs; $j<$end; $j++) + { + push(@numa,$ai); + push(@numb,$bi); + $ai--; + $bi++; + last if ($ai < $bi); + } + if (!($i & 1)) + { + @numa=reverse(@numa); + @numb=reverse(@numb); + } + + &bn_clr($c2); + for ($j=0; $j <= $#numa; $j++) + { + if ($numa[$j] == $numb[$j]) + {&bn_sad($a,$numa[$j],$numb[$j],$c0,$c1,$c2,1);} + else + {&bn_sad($a,$numa[$j],$numb[$j],$c0,$c1,$c2,2);} + } + &bn_str(&bn_ary($r,$i),$c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + + $as++ if ($i < ($num-1)); + $ae++ if ($i >= ($num-1)); + + $bs++ if ($i >= ($num-1)); + $be++ if ($i < ($num-1)); + } + &bn_str(&bn_ary($r,$i),$c0); + } diff --git a/crypto/bn/d.c b/crypto/bn/d.c new file mode 100644 index 0000000000..f738b5025e --- /dev/null +++ b/crypto/bn/d.c @@ -0,0 +1,72 @@ +#include <stdio.h> +#include "bio.h" 
+#include "bn_lcl.h" + +#define SIZE_A (100*4+4) +#define SIZE_B (13*4) + +main(argc,argv) +int argc; +char *argv[]; + { + BN_CTX ctx; + BN_RECP_CTX recp; + BIGNUM a,b,dd,d,r,rr,t,l; + int i; + + MemCheck_start(); + MemCheck_on(); + BN_CTX_init(&ctx); + BN_RECP_CTX_init(&recp); + + BN_init(&r); + BN_init(&rr); + BN_init(&d); + BN_init(&dd); + BN_init(&a); + BN_init(&b); + + { + BN_rand(&a,SIZE_A,0,0); + BN_rand(&b,SIZE_B,0,0); + + a.neg=1; + BN_RECP_CTX_set(&recp,&b,&ctx); + + BN_print_fp(stdout,&a); printf(" a\n"); + BN_print_fp(stdout,&b); printf(" b\n"); + + BN_print_fp(stdout,&recp.N); printf(" N\n"); + BN_print_fp(stdout,&recp.Nr); printf(" Nr num_bits=%d\n",recp.num_bits); + + BN_div_recp(&r,&d,&a,&recp,&ctx); + +for (i=0; i<300; i++) + BN_div(&rr,&dd,&a,&b,&ctx); + + BN_print_fp(stdout,&r); printf(" div recp\n"); + BN_print_fp(stdout,&rr); printf(" div\n"); + BN_print_fp(stdout,&d); printf(" rem recp\n"); + BN_print_fp(stdout,&dd); printf(" rem\n"); + } + BN_CTX_free(&ctx); + BN_RECP_CTX_free(&recp); + + BN_free(&r); + BN_free(&rr); + BN_free(&d); + BN_free(&dd); + BN_free(&a); + BN_free(&b); + + { + BIO *out; + + if ((out=BIO_new(BIO_s_file())) != NULL) + BIO_set_fp(out,stderr,BIO_NOCLOSE|BIO_FP_TEXT); + + CRYPTO_mem_leaks(out); + BIO_free(out); + } + + } diff --git a/crypto/bn/exp.c b/crypto/bn/exp.c new file mode 100644 index 0000000000..2427116564 --- /dev/null +++ b/crypto/bn/exp.c @@ -0,0 +1,60 @@ +#include <stdio.h> +#include "tmdiff.h" +#include "bn_lcl.h" + +#define SIZE 256 +#define NUM (8*8*8) +#define MOD (8*8*8*8*8) + +main(argc,argv) +int argc; +char *argv[]; + { + BN_CTX ctx; + BIGNUM a,b,c,r,rr,t,l; + int j,i,size=SIZE,num=NUM,mod=MOD; + char *start,*end; + BN_MONT_CTX mont; + double d,md; + + BN_MONT_CTX_init(&mont); + BN_CTX_init(&ctx); + BN_init(&a); + BN_init(&b); + BN_init(&c); + BN_init(&r); + + start=ms_time_new(); + end=ms_time_new(); + while (size <= 1024*8) + { + BN_rand(&a,size,0,0); + BN_rand(&b,size,1,0); + BN_rand(&c,size,0,1); 
+ + BN_mod(&a,&a,&c,&ctx); + + ms_time_get(start); + for (i=0; i<10; i++) + BN_MONT_CTX_set(&mont,&c,&ctx); + ms_time_get(end); + md=ms_time_diff(start,end); + + ms_time_get(start); + for (i=0; i<num; i++) + { + //bn_mull(&r,&a,&b,&ctx); + //BN_sqr(&r,&a,&ctx); + BN_mod_exp_mont(&r,&a,&b,&c,&ctx,&mont); + } + ms_time_get(end); + d=ms_time_diff(start,end) *50/33 /**/; + printf("%5d bit:%6.2f %6d %6.4f %4d m_set(%5.4f)\n",size, + d,num,d/num,(int)((d/num)*mod),md/10.0); + num/=8; + mod/=8; + if (num <= 0) num=1; + size*=2; + } + + } diff --git a/crypto/bn/expspeed.c b/crypto/bn/expspeed.c index 344f883d35..fe00373246 100644 --- a/crypto/bn/expspeed.c +++ b/crypto/bn/expspeed.c @@ -94,7 +94,8 @@ struct tms { #include <sys/timeb.h> #endif -#ifdef sun +#if defined(sun) || defined(__ultrix) +#define _POSIX_SOURCE #include <limits.h> #include <sys/param.h> #endif diff --git a/crypto/bn/exptest.c b/crypto/bn/exptest.c index 67dc95d726..1ec61c2c87 100644 --- a/crypto/bn/exptest.c +++ b/crypto/bn/exptest.c @@ -79,6 +79,8 @@ char *argv[]; unsigned char c; BIGNUM *r_mont,*r_recp,*a,*b,*m; + ERR_load_BN_strings(); + ctx=BN_CTX_new(); if (ctx == NULL) exit(1); r_mont=BN_new(); @@ -114,11 +116,19 @@ char *argv[]; ret=BN_mod_exp_mont(r_mont,a,b,m,ctx,NULL); if (ret <= 0) - { printf("BN_mod_exp_mont() problems\n"); exit(1); } + { + printf("BN_mod_exp_mont() problems\n"); + ERR_print_errors(out); + exit(1); + } ret=BN_mod_exp_recp(r_recp,a,b,m,ctx); if (ret <= 0) - { printf("BN_mod_exp_recp() problems\n"); exit(1); } + { + printf("BN_mod_exp_recp() problems\n"); + ERR_print_errors(out); + exit(1); + } if (BN_cmp(r_mont,r_recp) != 0) { @@ -137,6 +147,7 @@ char *argv[]; fflush(stdout); } } + CRYPTO_mem_leaks(out); printf(" done\n"); exit(0); err: diff --git a/crypto/bn/m.pl b/crypto/bn/m.pl new file mode 100644 index 0000000000..f69b036666 --- /dev/null +++ b/crypto/bn/m.pl @@ -0,0 +1,32 @@ +#!/usr/local/bin/perl + + +for ($i=0; $i<256; $i++) + { + for ($j=0; $j<256; $j++) + { + 
$a0=$i&0x0f; + $a1=($i>>4)&0x0f; + $b0=$j&0x0f; + $b1=($j>>4)&0x0f; + + $a0b0=$a0*$b0; + $a1b1=$a1*$b1; + + $a01=$a0-$a1; + $b10=$b1-$b0; + $a01b10=$a01*$b10; + + if ($a01b10 < 0) + { + $neg=1; + $a01b10= -$a01b10; + } + $t=($a0b0>>4)+($a0b0&0x0f)+($a1b1&0x0f); + if ($neg) + { $t-=($a01b10&0x0f); } + else { $t+=($a01b10&0x0f); } + printf("%02X %s%02X %02X\n",$a1b1,($neg)?"-":" ",$a01b10,$a0b0) + if ($t < 0) + } + } diff --git a/crypto/bn/new b/crypto/bn/new new file mode 100644 index 0000000000..285d506f19 --- /dev/null +++ b/crypto/bn/new @@ -0,0 +1,23 @@ +void BN_RECP_CTX_init(BN_RECP_CTX *recp); +BN_RECP_CTX *BN_RECP_CTX_new(); +void BN_RECP_CTX_free(BN_RECP_CTX *recp); +int BN_RECP_CTX_set(BN_RECP_CTX *recp,BIGNUM *div,BN_CTX *ctx); + +int BN_mod_exp_recp(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m, + BN_RECP_CTX *recp,BN_CTX *ctx); + +int BN_div_recp(BIGNUM *dv, BIGNUM *rem, BIGNUM *m, BIGNUM *d, + BN_RECP_CTX *recp, BN_CTX *ctx); +int BN_mod_recp(BIGNUM *rem, BIGNUM *m, BIGNUM *d, + BN_RECP_CTX *recp, BN_CTX *ctx); +int BN_mod_mul_recp(BIGNUM *ret,BIGNUM *a,BIGNUM *b,BIGNUM *m + +int BN_mod_exp_montgomery(BIGNUM *r, BIGNUM *a, BIGNUM *p, + BN_MONT_CTX *m_ctx,BN_CTX *ctx); +int BN_mod_exp2_montgomery(BIGNUM *r, BIGNUM *a1, BIGNUM *p1,BIGNUM *a2, + BIGNUM *p2,BN_MONT_CTX *m_ctx,BN_CTX *ctx); + + +bn_div64 -> bn_div_words + + diff --git a/crypto/bn/old/b_sqr.c b/crypto/bn/old/b_sqr.c new file mode 100644 index 0000000000..e1a61b8471 --- /dev/null +++ b/crypto/bn/old/b_sqr.c @@ -0,0 +1,205 @@ +/* crypto/bn/bn_mul.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. 
The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. 
If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
+ */ + +#include <stdio.h> +#include "cryptlib.h" +#include "bn_lcl.h" + +static int bn_mm(BIGNUM *m,BIGNUM *A,BIGNUM *B, BIGNUM *sk,BN_CTX *ctx); + +/* r must be different to a and b */ +/* int BN_mmul(r, a, b) */ +int BN_mul(r, a, b) +BIGNUM *r; +BIGNUM *a; +BIGNUM *b; + { + BN_ULONG *ap,*bp,*rp; + BIGNUM *sk; + int i,n,ret; + int max,al,bl; + BN_CTX ctx; + + bn_check_top(a); + bn_check_top(b); + + al=a->top; + bl=b->top; + if ((al == 0) || (bl == 0)) + { + r->top=0; + return(1); + } +#ifdef BN_MUL_DEBUG +printf("BN_mul(%d,%d)\n",a->top,b->top); +#endif + + if ( (bn_limit_bits > 0) && + (bl > bn_limit_num) && (al > bn_limit_num)) + { + n=(BN_num_bits_word(al|bl)-bn_limit_bits); + n*=2; + sk=(BIGNUM *)Malloc(sizeof(BIGNUM)*n); + memset(sk,0,sizeof(BIGNUM)*n); + memset(&ctx,0,sizeof(ctx)); + + ret=bn_mm(r,a,b,&(sk[0]),&ctx); + for (i=0; i<n; i+=2) + { + BN_clear_free(&sk[i]); + BN_clear_free(&sk[i+1]); + } + Free(sk); + return(ret); + } + + max=(al+bl); + if (bn_wexpand(r,max) == NULL) return(0); + r->top=max; + r->neg=a->neg^b->neg; + ap=a->d; + bp=b->d; + rp=r->d; + + rp[al]=bn_mul_words(rp,ap,al,*(bp++)); + rp++; + for (i=1; i<bl; i++) + { + rp[al]=bn_mul_add_words(rp,ap,al,*(bp++)); + rp++; + } + if ((max > 0) && (r->d[max-1] == 0)) r->top--; + return(1); + } + + +#define ahal (sk[0]) +#define blbh (sk[1]) + +/* r must be different to a and b */ +int bn_mm(m, A, B, sk,ctx) +BIGNUM *m,*A,*B; +BIGNUM *sk; +BN_CTX *ctx; + { + int n,num,sqr=0; + int an,bn; + BIGNUM ah,al,bh,bl; + + an=A->top; + bn=B->top; +#ifdef BN_MUL_DEBUG +printf("bn_mm(%d,%d)\n",A->top,B->top); +#endif + + if (A == B) sqr=1; + num=(an>bn)?an:bn; + n=(num+1)/2; + /* Are going to now chop things into 'num' word chunks. 
*/ + + BN_init(&ah); + BN_init(&al); + BN_init(&bh); + BN_init(&bl); + + bn_set_low (&al,A,n); + bn_set_high(&ah,A,n); + bn_set_low (&bl,B,n); + bn_set_high(&bh,B,n); + + BN_sub(&ahal,&ah,&al); + BN_sub(&blbh,&bl,&bh); + + if (num <= (bn_limit_num+bn_limit_num)) + { + BN_mul(m,&ahal,&blbh); + if (sqr) + { + BN_sqr(&ahal,&al,ctx); + BN_sqr(&blbh,&ah,ctx); + } + else + { + BN_mul(&ahal,&al,&bl); + BN_mul(&blbh,&ah,&bh); + } + } + else + { + bn_mm(m,&ahal,&blbh,&(sk[2]),ctx); + bn_mm(&ahal,&al,&bl,&(sk[2]),ctx); + bn_mm(&blbh,&ah,&bh,&(sk[2]),ctx); + } + + BN_add(m,m,&ahal); + BN_add(m,m,&blbh); + + BN_lshift(m,m,n*BN_BITS2); + BN_lshift(&blbh,&blbh,n*BN_BITS2*2); + + BN_add(m,m,&ahal); + BN_add(m,m,&blbh); + + m->neg=A->neg^B->neg; + return(1); + } +#undef ahal (sk[0]) +#undef blbh (sk[1]) + +#include "bn_low.c" +#include "bn_high.c" diff --git a/crypto/bn/old/bn_com.c b/crypto/bn/old/bn_com.c new file mode 100644 index 0000000000..7666b2304c --- /dev/null +++ b/crypto/bn/old/bn_com.c @@ -0,0 +1,90 @@ +/* crypto/bn/bn_mulw.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. 
+ * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include <stdio.h> +#include "cryptlib.h" +#include "bn_lcl.h" + +#ifdef BN_LLONG + +ab +12 + a2 b2 +a1 b1 + +abc +123 + a3 b3 c3 + a2 b2 c2 +a1 b1 c1 + +abcd +1234 + a4 b4 c4 d4 + a3 b3 c3 d3 + a2 b2 c2 d2 +a1 b1 c1 d1 + +abcde +01234 + a5 b5 c5 d5 e5 + a4 b4 c4 d4 e4 + a3 b3 c3 d3 e3 + a2 b2 c2 d2 e2 + a1 b1 c1 d1 e1 +a0 b0 c0 d0 e0 diff --git a/crypto/bn/old/bn_high.c b/crypto/bn/old/bn_high.c new file mode 100644 index 0000000000..90268fb31a --- /dev/null +++ b/crypto/bn/old/bn_high.c @@ -0,0 +1,137 @@ +#include <stdio.h> +#include "cryptlib.h" +#include "bn_lcl.h" + +#undef BN_MUL_HIGH_DEBUG + +#ifdef BN_MUL_HIGH_DEBUG +#define debug_BN_print(a,b,c) BN_print_fp(a,b); printf(c); +#else +#define debug_BN_print(a,b,c) +#endif + +int BN_mul_high(BIGNUM *r,BIGNUM *a,BIGNUM *b,BIGNUM *low, int words); + +#undef t1 +#undef t2 + +int BN_mul_high(r,a,b,low,words) +BIGNUM *r,*a,*b,*low; +int words; + { + int w2,borrow=0,full=0; + BIGNUM t1,t2,t3,h,ah,al,bh,bl,m,s0,s1; + BN_ULONG ul1,ul2; + + BN_mul(r,a,b); + BN_rshift(r,r,words*BN_BITS2); + return(1); + + w2=(words+1)/2; + +#ifdef BN_MUL_HIGH_DEBUG +fprintf(stdout,"words=%d w2=%d\n",words,w2); +#endif +debug_BN_print(stdout,a," a\n"); +debug_BN_print(stdout,b," b\n"); +debug_BN_print(stdout,low," low\n"); + BN_init(&al); BN_init(&ah); + BN_init(&bl); BN_init(&bh); + BN_init(&t1); BN_init(&t2); BN_init(&t3); + BN_init(&s0); BN_init(&s1); + BN_init(&h); BN_init(&m); + + bn_set_low (&al,a,w2); + bn_set_high(&ah,a,w2); + bn_set_low (&bl,b,w2); + bn_set_high(&bh,b,w2); + + bn_set_low(&s0,low,w2); + bn_set_high(&s1,low,w2); + +debug_BN_print(stdout,&al," al\n"); +debug_BN_print(stdout,&ah," ah\n"); +debug_BN_print(stdout,&bl," bl\n"); +debug_BN_print(stdout,&bh," 
bh\n"); +debug_BN_print(stdout,&s0," s0\n"); +debug_BN_print(stdout,&s1," s1\n"); + + /* Calculate (al-ah)*(bh-bl) */ + BN_sub(&t1,&al,&ah); + BN_sub(&t2,&bh,&bl); + BN_mul(&m,&t1,&t2); + + /* Calculate ah*bh */ + BN_mul(&h,&ah,&bh); + + /* s0 == low(al*bl) + * s1 == low(ah*bh)+low((al-ah)*(bh-bl))+low(al*bl)+high(al*bl) + * We know s0 and s1 so the only unknown is high(al*bl) + * high(al*bl) == s1 - low(ah*bh+(al-ah)*(bh-bl)+s0) + */ + BN_add(&m,&m,&h); + BN_add(&t2,&m,&s0); + +debug_BN_print(stdout,&t2," middle value\n"); + + /* Quick and dirty mask off of high words */ + if (w2 < t2.top) t2.top=w2; +#if 0 + bn_set_low(&t3,&t2,w2); +#endif + +debug_BN_print(stdout,&t2," low middle value\n"); + BN_sub(&t1,&s1,&t2); + + if (t1.neg) + { +debug_BN_print(stdout,&t1," before\n"); + BN_zero(&t2); + BN_set_bit(&t2,w2*BN_BITS2); + BN_add(&t1,&t2,&t1); + /* BN_mask_bits(&t1,w2*BN_BITS2); */ + /* if (words < t1.top) t1.top=words; */ +debug_BN_print(stdout,&t1," after\n"); + borrow=1; + } + +/* XXXXX SPEED THIS UP */ + /* al*bl == high(al*bl)<<words+s0 */ + BN_lshift(&t1,&t1,w2*BN_BITS2); + BN_add(&t1,&t1,&s0); + if (w2*2 < t1.top) t1.top=w2*2; /* This should not happen? */ + + /* We now have + * al*bl - t1 + * (al-ah)*(bh-bl)+ah*bh - m + * ah*bh - h + */ +#if 0 + BN_add(&m,&m,&t1); +debug_BN_print(stdout,&t1," s10\n"); +debug_BN_print(stdout,&m," s21\n"); +debug_BN_print(stdout,&h," s32\n"); + BN_lshift(&m,&m,w2*BN_BITS2); + BN_lshift(&h,&h,w2*2*BN_BITS2); + BN_add(r,&m,&t1); + BN_add(r,r,&h); + BN_rshift(r,r,w2*2*BN_BITS2); +#else + BN_add(&m,&m,&t1); /* Do a cmp then +1 if needed? 
*/ + bn_set_high(&t3,&t1,w2); + BN_add(&m,&m,&t3); + bn_set_high(&t3,&m,w2); + BN_add(r,&h,&t3); +#endif + +#ifdef BN_MUL_HIGH_DEBUG +printf("carry=%d\n",borrow); +#endif +debug_BN_print(stdout,r," ret\n"); + BN_free(&t1); BN_free(&t2); + BN_free(&m); BN_free(&h); + return(1); + } + + + diff --git a/crypto/bn/old/bn_ka.c b/crypto/bn/old/bn_ka.c new file mode 100644 index 0000000000..b49a52aa73 --- /dev/null +++ b/crypto/bn/old/bn_ka.c @@ -0,0 +1,578 @@ +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include "bn_lcl.h" + +/* r is 2*n2 words in size, + * a and b are both n2 words in size. + * n2 must be a power of 2. + * We multiply and return the result. + * t must be 2*n2 words in size + * We calulate + * a[0]*b[0] + * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0]) + * a[1]*b[1] + */ +void bn_mul_recursive(r,a,b,n2,t) +BN_ULONG *r,*a,*b; +int n2; +BN_ULONG *t; + { + int n=n2/2; + int neg,zero,c1,c2; + BN_ULONG ln,lo,*p; + +#ifdef BN_COUNT +printf(" bn_mul_recursive %d * %d\n",n2,n2); +#endif + if (n2 <= 8) + { + if (n2 == 8) + bn_mul_comba8(r,a,b); + else + bn_mul_normal(r,a,n2,b,n2); + return; + } + + if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) + { + /* This should not happen */ + /*abort(); */ + bn_mul_normal(r,a,n2,b,n2); + return; + } + /* r=(a[0]-a[1])*(b[1]-b[0]) */ + c1=bn_cmp_words(a,&(a[n]),n); + c2=bn_cmp_words(&(b[n]),b,n); + zero=neg=0; + switch (c1*3+c2) + { + case -4: + bn_sub_words(t, &(a[n]),a, n); /* - */ + bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ + break; + case -3: + zero=1; + break; + case -2: + bn_sub_words(t, &(a[n]),a, n); /* - */ + bn_sub_words(&(t[n]),&(b[n]),b, n); /* + */ + neg=1; + break; + case -1: + case 0: + case 1: + zero=1; + break; + case 2: + bn_sub_words(t, a, &(a[n]),n); /* + */ + bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ + neg=1; + break; + case 3: + zero=1; + break; + case 4: + bn_sub_words(t, a, &(a[n]),n); + bn_sub_words(&(t[n]),&(b[n]),b, n); + break; + } + + if (n == 8) + { + if (!zero) + 
bn_mul_comba8(&(t[n2]),t,&(t[n])); + else + memset(&(t[n2]),0,8*sizeof(BN_ULONG)); + + bn_mul_comba8(r,a,b); + bn_mul_comba8(&(r[n2]),&(a[n]),&(b[n])); + } + else + { + p= &(t[n2*2]); + if (!zero) + bn_mul_recursive(&(t[n2]),t,&(t[n]),n,p); + else + memset(&(t[n2]),0,n*sizeof(BN_ULONG)); + bn_mul_recursive(r,a,b,n,p); + bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),n,p); + } + + /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign + * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + */ + + c1=bn_add_words(t,r,&(r[n2]),n2); + + if (neg) /* if t[32] is negative */ + { + c1-=bn_sub_words(&(t[n2]),t,&(t[n2]),n2); + } + else + { + /* Might have a carry */ + c1+=bn_add_words(&(t[n2]),&(t[n2]),t,n2); + } + + /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1]) + * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + * c1 holds the carry bits + */ + c1+=bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2); + if (c1) + { + p= &(r[n+n2]); + lo= *p; + ln=(lo+c1)&BN_MASK2; + *p=ln; + + /* The overflow will stop before we over write + * words we should not overwrite */ + if (ln < c1) + { + do { + p++; + lo= *p; + ln=(lo+1)&BN_MASK2; + *p=ln; + } while (ln == 0); + } + } + } + +/* n+tn is the word length + * t needs to be n*4 is size, as does r */ +void bn_mul_part_recursive(r,a,b,tn,n,t) +BN_ULONG *r,*a,*b; +int tn,n; +BN_ULONG *t; + { + int n2=n*2,i,j; + int c1; + BN_ULONG ln,lo,*p; + +#ifdef BN_COUNT +printf(" bn_mul_part_recursive %d * %d\n",tn+n,tn+n); +#endif + if (n < 8) + { + i=tn+n; + bn_mul_normal(r,a,i,b,i); + return; + } + + /* r=(a[0]-a[1])*(b[1]-b[0]) */ + bn_sub_words(t, a, &(a[n]),n); /* + */ + bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ + + if (n == 8) + { + bn_mul_comba8(&(t[n2]),t,&(t[n])); + bn_mul_comba8(r,a,b); + bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn); + memset(&(r[n2+tn*2]),0,sizeof(BN_ULONG)*(n2-tn*2)); + } + else + { + p= &(t[n2*2]); + bn_mul_recursive(&(t[n2]),t,&(t[n]),n,p); + bn_mul_recursive(r,a,b,n,p); + i=n/2; + /* If there is only a 
bottom half to the number, + * just do it */ + j=tn-i; + if (j == 0) + { + bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),i,p); + memset(&(r[n2+i*2]),0,sizeof(BN_ULONG)*(n2-i*2)); + } + else if (j > 0) /* eg, n == 16, i == 8 and tn == 11 */ + { + bn_mul_part_recursive(&(r[n2]),&(a[n]),&(b[n]), + j,i,p); + memset(&(r[n2+tn*2]),0, + sizeof(BN_ULONG)*(n2-tn*2)); + } + else /* (j < 0) eg, n == 16, i == 8 and tn == 5 */ + { + memset(&(r[n2]),0,sizeof(BN_ULONG)*(tn*2)); + for (;;) + { + i/=2; + if (i < tn) + { + bn_mul_part_recursive(&(r[n2]), + &(a[n]),&(b[n]), + tn-i,i,p); + break; + } + else if (i == tn) + { + bn_mul_recursive(&(r[n2]), + &(a[n]),&(b[n]), + i,p); + break; + } + } + } + } + + /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign + * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + */ + + c1=bn_add_words(t,r,&(r[n2]),n2); + c1-=bn_sub_words(&(t[n2]),t,&(t[n2]),n2); + + /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1]) + * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + * c1 holds the carry bits + */ + c1+=bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2); + if (c1) + { + p= &(r[n+n2]); + lo= *p; + ln=(lo+c1)&BN_MASK2; + *p=ln; + + /* The overflow will stop before we over write + * words we should not overwrite */ + if (ln < c1) + { + do { + p++; + lo= *p; + ln=(lo+1)&BN_MASK2; + *p=ln; + } while (ln == 0); + } + } + } + +/* r is 2*n words in size, + * a and b are both n words in size. + * n must be a power of 2. + * We multiply and return the result. 
+ * t must be 2*n words in size + * We calulate + * a[0]*b[0] + * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0]) + * a[1]*b[1] + */ +void bn_sqr_recursive(r,a,n2,t) +BN_ULONG *r,*a; +int n2; +BN_ULONG *t; + { + int n=n2/2; + int zero,c1; + BN_ULONG ln,lo,*p; + +#ifdef BN_COUNT +printf(" bn_sqr_recursive %d * %d\n",n2,n2); +#endif + if (n2 == 4) + { + bn_sqr_comba4(r,a); + return; + } + else if (n2 == 8) + { + bn_sqr_comba8(r,a); + return; + } + if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) + { + bn_sqr_normal(r,a,n2,t); + return; + abort(); + } + /* r=(a[0]-a[1])*(a[1]-a[0]) */ + c1=bn_cmp_words(a,&(a[n]),n); + zero=0; + if (c1 > 0) + bn_sub_words(t,a,&(a[n]),n); + else if (c1 < 0) + bn_sub_words(t,&(a[n]),a,n); + else + zero=1; + + /* The result will always be negative unless it is zero */ + + if (n == 8) + { + if (!zero) + bn_sqr_comba8(&(t[n2]),t); + else + memset(&(t[n2]),0,8*sizeof(BN_ULONG)); + + bn_sqr_comba8(r,a); + bn_sqr_comba8(&(r[n2]),&(a[n])); + } + else + { + p= &(t[n2*2]); + if (!zero) + bn_sqr_recursive(&(t[n2]),t,n,p); + else + memset(&(t[n2]),0,n*sizeof(BN_ULONG)); + bn_sqr_recursive(r,a,n,p); + bn_sqr_recursive(&(r[n2]),&(a[n]),n,p); + } + + /* t[32] holds (a[0]-a[1])*(a[1]-a[0]), it is negative or zero + * r[10] holds (a[0]*b[0]) + * r[32] holds (b[1]*b[1]) + */ + + c1=bn_add_words(t,r,&(r[n2]),n2); + + /* t[32] is negative */ + c1-=bn_sub_words(&(t[n2]),t,&(t[n2]),n2); + + /* t[32] holds (a[0]-a[1])*(a[1]-a[0])+(a[0]*a[0])+(a[1]*a[1]) + * r[10] holds (a[0]*a[0]) + * r[32] holds (a[1]*a[1]) + * c1 holds the carry bits + */ + c1+=bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2); + if (c1) + { + p= &(r[n+n2]); + lo= *p; + ln=(lo+c1)&BN_MASK2; + *p=ln; + + /* The overflow will stop before we over write + * words we should not overwrite */ + if (ln < c1) + { + do { + p++; + lo= *p; + ln=(lo+1)&BN_MASK2; + *p=ln; + } while (ln == 0); + } + } + } + +#if 1 +/* a and b must be the same size, which is n2. 
+ * r needs to be n2 words and t needs to be n2*2 + */ +void bn_mul_low_recursive(r,a,b,n2,t) +BN_ULONG *r,*a,*b; +int n2; +BN_ULONG *t; + { + int n=n2/2; + +#ifdef BN_COUNT +printf(" bn_mul_low_recursive %d * %d\n",n2,n2); +#endif + + bn_mul_recursive(r,a,b,n,&(t[0])); + if (n > BN_MUL_LOW_RECURSIVE_SIZE_NORMAL) + { + bn_mul_low_recursive(&(t[0]),&(a[0]),&(b[n]),n,&(t[n2])); + bn_add_words(&(r[n]),&(r[n]),&(t[0]),n); + bn_mul_low_recursive(&(t[0]),&(a[n]),&(b[0]),n,&(t[n2])); + bn_add_words(&(r[n]),&(r[n]),&(t[0]),n); + } + else + { + bn_mul_low_normal(&(t[0]),&(a[0]),&(b[n]),n); + bn_mul_low_normal(&(t[n]),&(a[n]),&(b[0]),n); + bn_add_words(&(r[n]),&(r[n]),&(t[0]),n); + bn_add_words(&(r[n]),&(r[n]),&(t[n]),n); + } + } + +/* a and b must be the same size, which is n2. + * r needs to be n2 words and t needs to be n2*2 + * l is the low words of the output. + * t needs to be n2*3 + */ +void bn_mul_high(r,a,b,l,n2,t) +BN_ULONG *r,*a,*b,*l; +int n2; +BN_ULONG *t; + { + int j,i,n,c1,c2; + int neg,oneg,zero; + BN_ULONG ll,lc,*lp,*mp; + +#ifdef BN_COUNT +printf(" bn_mul_high %d * %d\n",n2,n2); +#endif + n=(n2+1)/2; + + /* Calculate (al-ah)*(bh-bl) */ + neg=zero=0; + c1=bn_cmp_words(&(a[0]),&(a[n]),n); + c2=bn_cmp_words(&(b[n]),&(b[0]),n); + switch (c1*3+c2) + { + case -4: + bn_sub_words(&(r[0]),&(a[n]),&(a[0]),n); + bn_sub_words(&(r[n]),&(b[0]),&(b[n]),n); + break; + case -3: + zero=1; + break; + case -2: + bn_sub_words(&(r[0]),&(a[n]),&(a[0]),n); + bn_sub_words(&(r[n]),&(b[n]),&(b[0]),n); + neg=1; + break; + case -1: + case 0: + case 1: + zero=1; + break; + case 2: + bn_sub_words(&(r[0]),&(a[0]),&(a[n]),n); + bn_sub_words(&(r[n]),&(b[0]),&(b[n]),n); + neg=1; + break; + case 3: + zero=1; + break; + case 4: + bn_sub_words(&(r[0]),&(a[0]),&(a[n]),n); + bn_sub_words(&(r[n]),&(b[n]),&(b[0]),n); + break; + } + + oneg=neg; + /* t[10] = (a[0]-a[1])*(b[1]-b[0]) */ + bn_mul_recursive(&(t[0]),&(r[0]),&(r[n]),n,&(t[n2])); + /* r[10] = (a[1]*b[1]) */ + 
bn_mul_recursive(r,&(a[n]),&(b[n]),n,&(t[n2])); + + /* s0 == low(al*bl) + * s1 == low(ah*bh)+low((al-ah)*(bh-bl))+low(al*bl)+high(al*bl) + * We know s0 and s1 so the only unknown is high(al*bl) + * high(al*bl) == s1 - low(ah*bh+s0+(al-ah)*(bh-bl)) + * high(al*bl) == s1 - (r[0]+l[0]+t[0]) + */ + if (l != NULL) + { + lp= &(t[n2+n]); + c1=bn_add_words(lp,&(r[0]),&(l[0]),n); + } + else + { + c1=0; + lp= &(r[0]); + } + + if (neg) + neg=bn_sub_words(&(t[n2]),lp,&(t[0]),n); + else + { + bn_add_words(&(t[n2]),lp,&(t[0]),n); + neg=0; + } + + if (l != NULL) + { + bn_sub_words(&(t[n2+n]),&(l[n]),&(t[n2]),n); + } + else + { + lp= &(t[n2+n]); + mp= &(t[n2]); + for (i=0; i<n; i++) + lp[i]=((~mp[i])+1)&BN_MASK2; + } + + /* s[0] = low(al*bl) + * t[3] = high(al*bl) + * t[10] = (a[0]-a[1])*(b[1]-b[0]) neg is the sign + * r[10] = (a[1]*b[1]) + */ + /* R[10] = al*bl + * R[21] = al*bl + ah*bh + (a[0]-a[1])*(b[1]-b[0]) + * R[32] = ah*bh + */ + /* R[1]=t[3]+l[0]+r[0](+-)t[0] (have carry/borrow) + * R[2]=r[0]+t[3]+r[1](+-)t[1] (have carry/borrow) + * R[3]=r[1]+(carry/borrow) + */ + if (l != NULL) + { + lp= &(t[n2]); + c1= bn_add_words(lp,&(t[n2+n]),&(l[0]),n); + } + else + { + lp= &(t[n2+n]); + c1=0; + } + c1+=bn_add_words(&(t[n2]),lp, &(r[0]),n); + if (oneg) + c1-=bn_sub_words(&(t[n2]),&(t[n2]),&(t[0]),n); + else + c1+=bn_add_words(&(t[n2]),&(t[n2]),&(t[0]),n); + + c2 =bn_add_words(&(r[0]),&(r[0]),&(t[n2+n]),n); + c2+=bn_add_words(&(r[0]),&(r[0]),&(r[n]),n); + if (oneg) + c2-=bn_sub_words(&(r[0]),&(r[0]),&(t[n]),n); + else + c2+=bn_add_words(&(r[0]),&(r[0]),&(t[n]),n); + + if (c1 != 0) /* Add starting at r[0], could be +ve or -ve */ + { + i=0; + if (c1 > 0) + { + lc=c1; + do { + ll=(r[i]+lc)&BN_MASK2; + r[i++]=ll; + lc=(lc > ll); + } while (lc); + } + else + { + lc= -c1; + do { + ll=r[i]; + r[i++]=(ll-lc)&BN_MASK2; + lc=(lc > ll); + } while (lc); + } + } + if (c2 != 0) /* Add starting at r[1] */ + { + i=n; + if (c2 > 0) + { + lc=c2; + do { + ll=(r[i]+lc)&BN_MASK2; + r[i++]=ll; + lc=(lc > 
ll); + } while (lc); + } + else + { + lc= -c2; + do { + ll=r[i]; + r[i++]=(ll-lc)&BN_MASK2; + lc=(lc > ll); + } while (lc); + } + } + } +#endif diff --git a/crypto/bn/old/bn_low.c b/crypto/bn/old/bn_low.c new file mode 100644 index 0000000000..217c8c2f96 --- /dev/null +++ b/crypto/bn/old/bn_low.c @@ -0,0 +1,201 @@ +/* crypto/bn/bn_mul.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
+ */ + +#include <stdio.h> +#include "cryptlib.h" +#include "bn_lcl.h" + +static int bn_mm_low(BIGNUM *m,BIGNUM *A,BIGNUM *B, int num, + BIGNUM *sk,BN_CTX *ctx); +int BN_mul_low(BIGNUM *r, BIGNUM *a, BIGNUM *b,int words); + +/* r must be different to a and b */ +int BN_mul_low(r, a, b, num) +BIGNUM *r; +BIGNUM *a; +BIGNUM *b; +int num; + { + BN_ULONG *ap,*bp,*rp; + BIGNUM *sk; + int j,i,n,ret; + int max,al,bl; + BN_CTX ctx; + + bn_check_top(a); + bn_check_top(b); + +#ifdef BN_MUL_DEBUG +printf("BN_mul_low(%d,%d,%d)\n",a->top,b->top,num); +#endif + + al=a->top; + bl=b->top; + if ((al == 0) || (bl == 0)) + { + r->top=0; + return(1); + } + + if ((bn_limit_bits_low > 0) && (num > bn_limit_num_low)) + { + n=BN_num_bits_word(num*2)-bn_limit_bits_low; + n*=2; + sk=(BIGNUM *)Malloc(sizeof(BIGNUM)*n); + memset(sk,0,sizeof(BIGNUM)*n); + memset(&ctx,0,sizeof(ctx)); + + ret=bn_mm_low(r,a,b,num,&(sk[0]),&ctx); + for (i=0; i<n; i+=2) + { + BN_clear_free(&sk[i]); + BN_clear_free(&sk[i+1]); + } + Free(sk); + return(ret); + } + + max=(al+bl); + if (bn_wexpand(r,max) == NULL) return(0); + r->neg=a->neg^b->neg; + ap=a->d; + bp=b->d; + rp=r->d; + r->top=(max > num)?num:max; + + rp[al]=bn_mul_words(rp,ap,al,*(bp++)); + rp++; + j=bl; + for (i=1; i<j; i++) + { + if (al >= num--) + { + al--; + if (al <= 0) break; + } + rp[al]=bn_mul_add_words(rp,ap,al,*(bp++)); + rp++; + } + + while ((r->top > 0) && (r->d[r->top-1] == 0)) + r->top--; + return(1); + } + + +#define t1 (sk[0]) +#define t2 (sk[1]) + +/* r must be different to a and b */ +int bn_mm_low(m, A, B, num, sk,ctx) +BIGNUM *m,*A,*B; +int num; +BIGNUM *sk; +BN_CTX *ctx; + { + int n; /* ,sqr=0; */ + int an,bn; + BIGNUM ah,al,bh,bl; + + bn_wexpand(m,num+3); + an=A->top; + bn=B->top; + +#ifdef BN_MUL_DEBUG +printf("bn_mm_low(%d,%d,%d)\n",A->top,B->top,num); +#endif + + n=(num+1)/2; + + BN_init(&ah); BN_init(&al); BN_init(&bh); BN_init(&bl); + + bn_set_low( &al,A,n); + bn_set_high(&ah,A,n); + bn_set_low( &bl,B,n); + bn_set_high(&bh,B,n); + 
+ if (num <= (bn_limit_num_low+bn_limit_num_low)) + { + BN_mul(m,&al,&bl); + BN_mul_low(&t1,&al,&bh,n); + BN_mul_low(&t2,&ah,&bl,n); + } + else + { + bn_mm(m ,&al,&bl,&(sk[2]),ctx); + bn_mm_low(&t1,&al,&bh,n,&(sk[2]),ctx); + bn_mm_low(&t2,&ah,&bl,n,&(sk[2]),ctx); + } + + BN_add(&t1,&t1,&t2); + + /* We will now do an evil hack instead of + * BN_lshift(&t1,&t1,n*BN_BITS2); + * BN_add(m,m,&t1); + * BN_mask_bits(m,num*BN_BITS2); + */ + bn_set_high(&ah,m,n); ah.max=num+2; + BN_add(&ah,&ah,&t1); + m->top=num; + + m->neg=A->neg^B->neg; + return(1); + } + +#undef t1 (sk[0]) +#undef t2 (sk[1]) diff --git a/crypto/bn/old/bn_m.c b/crypto/bn/old/bn_m.c new file mode 100644 index 0000000000..1cf51e8e2a --- /dev/null +++ b/crypto/bn/old/bn_m.c @@ -0,0 +1,142 @@ +/* crypto/bn/bn_m.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. 
this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include <stdio.h> +/*#include "cryptlib.h"*/ +#include "bn_lcl.h" + +#define limit_bits 5 /* 2^5, or 32 words */ +#define limit_num (1<<limit_bits) + +int BN_m(r,a,b) +BIGNUM *r,*a,*b; + { + BIGNUM *sk; + int i,n; + + n=(BN_num_bits_word(a->top|b->top)-limit_bits); + n*=2; + sk=(BIGNUM *)malloc(sizeof(BIGNUM)*n); + for (i=0; i<n; i++) + BN_init(&(sk[i])); + + return(BN_mm(r,a,b,&(sk[0]))); + } + +#define ahal (sk[0]) +#define blbh (sk[1]) + +/* r must be different to a and b */ +int BN_mm(m, A, B, sk) +BIGNUM *m,*A,*B; +BIGNUM *sk; + { + int i,num,anum,bnum; + int an,bn; + BIGNUM ah,al,bh,bl; + + an=A->top; + bn=B->top; + if ((an <= limit_num) || (bn <= limit_num)) + { + return(BN_mul(m,A,B)); + } + + anum=(an>bn)?an:bn; + num=(anum)/2; + + /* Are going to now chop things into 'num' word chunks. */ + bnum=num*BN_BITS2; + + BN_init(&ahal); + BN_init(&blbh); + BN_init(&ah); + BN_init(&al); + BN_init(&bh); + BN_init(&bl); + + al.top=num; + al.d=A->d; + ah.top=A->top-num; + ah.d= &(A->d[num]); + + bl.top=num; + bl.d=B->d; + bh.top=B->top-num; + bh.d= &(B->d[num]); + + BN_sub(&ahal,&ah,&al); + BN_sub(&blbh,&bl,&bh); + + BN_mm(m,&ahal,&blbh,&(sk[2])); + BN_mm(&ahal,&al,&bl,&(sk[2])); + BN_mm(&blbh,&ah,&bh,&(sk[2])); + + BN_add(m,m,&ahal); + BN_add(m,m,&blbh); + + BN_lshift(m,m,bnum); + BN_add(m,m,&ahal); + + BN_lshift(&blbh,&blbh,bnum*2); + BN_add(m,m,&blbh); + + m->neg=A->neg^B->neg; + return(1); + } + diff --git a/crypto/bn/old/bn_mul.c.works b/crypto/bn/old/bn_mul.c.works new file mode 100644 index 0000000000..6d565d44a2 --- /dev/null +++ b/crypto/bn/old/bn_mul.c.works @@ -0,0 +1,219 @@ +/* crypto/bn/bn_mul.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). 
+ * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. 
If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
+ */ + +#include <stdio.h> +#include "cryptlib.h" +#include "bn_lcl.h" + +int bn_mm(BIGNUM *m,BIGNUM *A,BIGNUM *B, BIGNUM *sk,BN_CTX *ctx); + +/* r must be different to a and b */ +int BN_mul(r, a, b) +BIGNUM *r; +BIGNUM *a; +BIGNUM *b; + { + BN_ULONG *ap,*bp,*rp; + BIGNUM *sk; + int i,n,ret; + int max,al,bl; + BN_CTX ctx; + + bn_check_top(a); + bn_check_top(b); + + al=a->top; + bl=b->top; + if ((al == 0) || (bl == 0)) + { + r->top=0; + return(1); + } +#ifdef BN_MUL_DEBUG +printf("BN_mul(%d,%d)\n",a->top,b->top); +#endif + +#ifdef BN_RECURSION + if ( (bn_limit_bits > 0) && + (bl > bn_limit_num) && (al > bn_limit_num)) + { + n=(BN_num_bits_word(al|bl)-bn_limit_bits); + n*=2; + sk=(BIGNUM *)Malloc(sizeof(BIGNUM)*n); + memset(sk,0,sizeof(BIGNUM)*n); + memset(&ctx,0,sizeof(ctx)); + + ret=bn_mm(r,a,b,&(sk[0]),&ctx); + for (i=0; i<n; i+=2) + { + BN_clear_free(&sk[i]); + BN_clear_free(&sk[i+1]); + } + Free(sk); + return(ret); + } +#endif + + max=(al+bl); + if (bn_wexpand(r,max) == NULL) return(0); + r->top=max; + r->neg=a->neg^b->neg; + ap=a->d; + bp=b->d; + rp=r->d; + +#ifdef BN_RECURSION + if ((al == bl) && (al == 8)) + { + bn_mul_comba8(rp,ap,bp); + } + else +#endif + { + rp[al]=bn_mul_words(rp,ap,al,*(bp++)); + rp++; + for (i=1; i<bl; i++) + { + rp[al]=bn_mul_add_words(rp,ap,al,*(bp++)); + rp++; + } + } + if ((max > 0) && (r->d[max-1] == 0)) r->top--; + return(1); + } + +#ifdef BN_RECURSION + +#define ahal (sk[0]) +#define blbh (sk[1]) + +/* r must be different to a and b */ +int bn_mm(m, A, B, sk,ctx) +BIGNUM *m,*A,*B; +BIGNUM *sk; +BN_CTX *ctx; + { + int n,num,sqr=0; + int an,bn; + BIGNUM ah,al,bh,bl; + + an=A->top; + bn=B->top; +#ifdef BN_MUL_DEBUG +printf("bn_mm(%d,%d)\n",A->top,B->top); +#endif + + if (A == B) sqr=1; + num=(an>bn)?an:bn; + n=(num+1)/2; + /* Are going to now chop things into 'num' word chunks. 
*/ + + BN_init(&ah); + BN_init(&al); + BN_init(&bh); + BN_init(&bl); + + bn_set_low (&al,A,n); + bn_set_high(&ah,A,n); + bn_set_low (&bl,B,n); + bn_set_high(&bh,B,n); + + BN_sub(&ahal,&ah,&al); + BN_sub(&blbh,&bl,&bh); + + if (num <= (bn_limit_num+bn_limit_num)) + { + BN_mul(m,&ahal,&blbh); + if (sqr) + { + BN_sqr(&ahal,&al,ctx); + BN_sqr(&blbh,&ah,ctx); + } + else + { + BN_mul(&ahal,&al,&bl); + BN_mul(&blbh,&ah,&bh); + } + } + else + { + bn_mm(m,&ahal,&blbh,&(sk[2]),ctx); + bn_mm(&ahal,&al,&bl,&(sk[2]),ctx); + bn_mm(&blbh,&ah,&bh,&(sk[2]),ctx); + } + + BN_add(m,m,&ahal); + BN_add(m,m,&blbh); + + BN_lshift(m,m,n*BN_BITS2); + BN_lshift(&blbh,&blbh,n*BN_BITS2*2); + + BN_add(m,m,&ahal); + BN_add(m,m,&blbh); + + m->neg=A->neg^B->neg; + return(1); + } +#undef ahal (sk[0]) +#undef blbh (sk[1]) + +#include "bn_low.c" +#include "bn_high.c" +#include "f.c" + +#endif diff --git a/crypto/bn/old/bn_wmul.c b/crypto/bn/old/bn_wmul.c new file mode 100644 index 0000000000..e3ce107921 --- /dev/null +++ b/crypto/bn/old/bn_wmul.c @@ -0,0 +1,181 @@ +#include <stdio.h> +#include "bn_lcl.h" + +#if 1 + +int bn_mull(BIGNUM *r,BIGNUM *a,BIGNUM *b, BN_CTX *ctx); + +int bn_mull(r,a,b,ctx) +BIGNUM *r,*a,*b; +BN_CTX *ctx; + { + int top,i,j,k,al,bl; + BIGNUM *t; + +#ifdef BN_COUNT +printf("bn_mull %d * %d\n",a->top,b->top); +#endif + + bn_check_top(a); + bn_check_top(b); + bn_check_top(r); + + al=a->top; + bl=b->top; + r->neg=a->neg^b->neg; + + top=al+bl; + if ((al < 4) || (bl < 4)) + { + if (bn_wexpand(r,top) == NULL) return(0); + r->top=top; + bn_mul_normal(r->d,a->d,al,b->d,bl); + goto end; + } + else if (al == bl) /* A good start, they are the same size */ + goto symetric; + else + { + i=(al-bl); + if ((i == 1) && !BN_get_flags(b,BN_FLG_STATIC_DATA)) + { + bn_wexpand(b,al); + b->d[bl]=0; + bl++; + goto symetric; + } + else if ((i == -1) && !BN_get_flags(a,BN_FLG_STATIC_DATA)) + { + bn_wexpand(a,bl); + a->d[al]=0; + al++; + goto symetric; + } + } + + /* asymetric and >= 4 */ + if 
(bn_wexpand(r,top) == NULL) return(0); + r->top=top; + bn_mul_normal(r->d,a->d,al,b->d,bl); + + if (0) + { + /* symetric and > 4 */ +symetric: + if (al == 4) + { + if (bn_wexpand(r,al*2) == NULL) return(0); + r->top=top; + bn_mul_comba4(r->d,a->d,b->d); + goto end; + } + if (al == 8) + { + if (bn_wexpand(r,al*2) == NULL) return(0); + r->top=top; + bn_mul_comba8(r->d,a->d,b->d); + goto end; + } + if (al <= BN_MULL_NORMAL_SIZE) + { + if (bn_wexpand(r,al*2) == NULL) return(0); + r->top=top; + bn_mul_normal(r->d,a->d,al,b->d,bl); + goto end; + } + /* 16 or larger */ + j=BN_num_bits_word((BN_ULONG)al); + j=1<<(j-1); + k=j+j; + t= &(ctx->bn[ctx->tos]); + if (al == j) /* exact multiple */ + { + bn_wexpand(t,k*2); + bn_wexpand(r,k*2); + bn_mul_recursive(r->d,a->d,b->d,al,t->d); + } + else + { + bn_wexpand(a,k); + bn_wexpand(b,k); + bn_wexpand(t,k*4); + bn_wexpand(r,k*4); + for (i=a->top; i<k; i++) + a->d[i]=0; + for (i=b->top; i<k; i++) + b->d[i]=0; + bn_mul_part_recursive(r->d,a->d,b->d,al-j,j,t->d); + } + r->top=top; + } +end: + bn_fix_top(r); + return(1); + } +#endif + +void bn_mul_normal(r,a,na,b,nb) +BN_ULONG *r,*a; +int na; +BN_ULONG *b; +int nb; + { + BN_ULONG *rr; + +#ifdef BN_COUNT +printf(" bn_mul_normal %d * %d\n",na,nb); +#endif + + if (na < nb) + { + int itmp; + BN_ULONG *ltmp; + + itmp=na; na=nb; nb=itmp; + ltmp=a; a=b; b=ltmp; + + } + rr= &(r[na]); + rr[0]=bn_mul_words(r,a,na,b[0]); + + for (;;) + { + if (--nb <= 0) return; + rr[1]=bn_mul_add_words(&(r[1]),a,na,b[1]); + if (--nb <= 0) return; + rr[2]=bn_mul_add_words(&(r[2]),a,na,b[2]); + if (--nb <= 0) return; + rr[3]=bn_mul_add_words(&(r[3]),a,na,b[3]); + if (--nb <= 0) return; + rr[4]=bn_mul_add_words(&(r[4]),a,na,b[4]); + rr+=4; + r+=4; + b+=4; + } + } + +#if 1 +void bn_mul_low_normal(r,a,b,n) +BN_ULONG *r,*a,*b; +int n; + { +#ifdef BN_COUNT +printf(" bn_mul_low_normal %d * %d\n",n,n); +#endif + bn_mul_words(r,a,n,b[0]); + + for (;;) + { + if (--n <= 0) return; + bn_mul_add_words(&(r[1]),a,n,b[1]); + if 
(--n <= 0) return; + bn_mul_add_words(&(r[2]),a,n,b[2]); + if (--n <= 0) return; + bn_mul_add_words(&(r[3]),a,n,b[3]); + if (--n <= 0) return; + bn_mul_add_words(&(r[4]),a,n,b[4]); + r+=4; + b+=4; + } + } +#endif diff --git a/crypto/bn/old/build b/crypto/bn/old/build new file mode 100755 index 0000000000..8cd99e5f17 --- /dev/null +++ b/crypto/bn/old/build @@ -0,0 +1,3 @@ +#!/bin/sh -x + +gcc -g -I../../include test.c -L../.. -lcrypto diff --git a/crypto/bn/old/info b/crypto/bn/old/info new file mode 100644 index 0000000000..5ac99c3b23 --- /dev/null +++ b/crypto/bn/old/info @@ -0,0 +1,22 @@ +Given A1A0 * B1B0 == S3S2S1S0 + +S0= low(A0*B0) +S1= low( (A1-A0)*(B0-B1)) +low( A1*B1) +high(A0*B0) +S2= high((A1-A0)*(B0-B1)) +high(A1*B1) +low( A1*B1) +S3= high(A1*B1); + +Assume we know S1 and S0, and can calulate A1*B1 and high((A1-A0)*(B0-B1)) + +k0= S0 == low(A0*B0) +k1= S1 +k2= low( A1*B1) +k3= high(A1*B1) +k4= high((A1-A0)*(B0-B1)) + +k1= low((A1-A0)*(B0-B1)) +k2 +high(A0*B0) +S2= k4 +k3 +k2 +S3= k3 + +S1-k2= low((A1-A0)*(B0-B1)) +high(A0*B0) + +We potentially have a carry or a borrow from S1 diff --git a/crypto/bn/old/test.works b/crypto/bn/old/test.works new file mode 100644 index 0000000000..127c7b415d --- /dev/null +++ b/crypto/bn/old/test.works @@ -0,0 +1,205 @@ +#include <stdio.h> +#include "cryptlib.h" +#include "bn_lcl.h" + +#define SIZE 128 + +#define BN_MONT_CTX_set bn_mcs +#define BN_from_montgomery bn_fm +#define BN_mod_mul_montgomery bn_mmm +#undef BN_to_montgomery +#define BN_to_montgomery(r,a,mont,ctx) bn_mmm(\ + r,a,(mont)->RR,(mont),ctx) + +main() + { + BIGNUM prime,a,b,r,A,B,R; + BN_MONT_CTX *mont; + BN_CTX *ctx; + int i; + + ctx=BN_CTX_new(); + BN_init(&prime); + BN_init(&a); BN_init(&b); BN_init(&r); + BN_init(&A); BN_init(&B); BN_init(&R); + + BN_generate_prime(&prime,SIZE,0,NULL,NULL,NULL,NULL); + BN_rand(&A,SIZE,1,0); + BN_rand(&B,SIZE,1,0); + BN_mod(&A,&A,&prime,ctx); + BN_mod(&B,&B,&prime,ctx); + + mont=BN_MONT_CTX_new(); + 
BN_MONT_CTX_set(mont,&prime,ctx); + + BN_to_montgomery(&a,&A,mont,ctx); + BN_to_montgomery(&b,&B,mont,ctx); + + BN_mul(&r,&a,&b); + BN_print_fp(stdout,&r); printf("\n"); + BN_from_montgomery(&r,&r,mont,ctx); + BN_print_fp(stdout,&r); printf("\n"); + BN_from_montgomery(&r,&r,mont,ctx); + BN_print_fp(stdout,&r); printf("\n"); + + BN_mod_mul(&R,&A,&B,&prime,ctx); + + BN_print_fp(stdout,&a); printf("\n"); + BN_print_fp(stdout,&b); printf("\n"); + BN_print_fp(stdout,&prime); printf("\n"); + BN_print_fp(stdout,&r); printf("\n\n"); + + BN_print_fp(stdout,&A); printf("\n"); + BN_print_fp(stdout,&B); printf("\n"); + BN_print_fp(stdout,&prime); printf("\n"); + BN_print_fp(stdout,&R); printf("\n\n"); + + BN_mul(&r,&a,&b); + BN_print_fp(stdout,&r); printf(" <- BA*DC\n"); + BN_copy(&A,&r); + i=SIZE/2; + BN_mask_bits(&A,i*2); +// BN_print_fp(stdout,&A); printf(" <- low(BA*DC)\n"); + bn_do_lower(&r,&a,&b,&A,i); +// BN_print_fp(stdout,&r); printf(" <- low(BA*DC)\n"); + } + +int bn_mul_low(r,a,b,low,i) +BIGNUM *r,*a,*b,*low; +int i; + { + int w; + BIGNUM Kh,Km,t1,t2,h,ah,al,bh,bl,l,m,s0,s1; + + BN_init(&Kh); BN_init(&Km); BN_init(&t1); BN_init(&t2); BN_init(&l); + BN_init(&ah); BN_init(&al); BN_init(&bh); BN_init(&bl); BN_init(&h); + BN_init(&m); BN_init(&s0); BN_init(&s1); + + BN_copy(&al,a); BN_mask_bits(&al,i); BN_rshift(&ah,a,i); + BN_copy(&bl,b); BN_mask_bits(&bl,i); BN_rshift(&bh,b,i); + + + BN_sub(&t1,&al,&ah); + BN_sub(&t2,&bh,&bl); + BN_mul(&m,&t1,&t2); + BN_mul(&h,&ah,&bh); + + BN_copy(&s0,low); BN_mask_bits(&s0,i); + BN_rshift(&s1,low,i); + + BN_add(&t1,&h,&m); + BN_add(&t1,&t1,&s0); + + BN_copy(&t2,&t1); BN_mask_bits(&t2,i); + BN_sub(&t1,&s1,&t2); + BN_lshift(&t1,&t1,i); + BN_add(&t1,&t1,&s0); + if (t1.neg) + { + BN_lshift(&t2,BN_value_one(),i*2); + BN_add(&t1,&t2,&t1); + BN_mask_bits(&t1,i*2); + } + + BN_free(&Kh); BN_free(&Km); BN_free(&t1); BN_free(&t2); + BN_free(&ah); BN_free(&al); BN_free(&bh); BN_free(&bl); + } + +int BN_mod_mul_montgomery(r,a,b,mont,ctx) +BIGNUM 
*r,*a,*b; +BN_MONT_CTX *mont; +BN_CTX *ctx; + { + BIGNUM *tmp; + + tmp= &(ctx->bn[ctx->tos++]); + + if (a == b) + { + if (!BN_sqr(tmp,a,ctx)) goto err; + } + else + { + if (!BN_mul(tmp,a,b)) goto err; + } + /* reduce from aRR to aR */ + if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err; + ctx->tos--; + return(1); +err: + return(0); + } + +int BN_from_montgomery(r,a,mont,ctx) +BIGNUM *r; +BIGNUM *a; +BN_MONT_CTX *mont; +BN_CTX *ctx; + { + BIGNUM z1; + BIGNUM *t1,*t2; + BN_ULONG *ap,*bp,*rp; + int j,i,bl,al; + + BN_init(&z1); + t1= &(ctx->bn[ctx->tos]); + t2= &(ctx->bn[ctx->tos+1]); + + if (!BN_copy(t1,a)) goto err; + /* can cheat */ + BN_mask_bits(t1,mont->ri); + if (!BN_mul(t2,t1,mont->Ni)) goto err; + BN_mask_bits(t2,mont->ri); + + if (!BN_mul(t1,t2,mont->N)) goto err; + if (!BN_add(t2,t1,a)) goto err; + + /* At this point, t2 has the bottom ri bits set to zero. + * This means that the bottom ri bits == the 1^ri minus the bottom + * ri bits of a. + * This means that only the bits above 'ri' in a need to be added, + * and XXXXXXXXXXXXXXXXXXXXXXXX + */ +BN_print_fp(stdout,t2); printf("\n"); + BN_rshift(r,t2,mont->ri); + + if (BN_ucmp(r,mont->N) >= 0) + bn_qsub(r,r,mont->N); + + return(1); +err: + return(0); + } + +int BN_MONT_CTX_set(mont,mod,ctx) +BN_MONT_CTX *mont; +BIGNUM *mod; +BN_CTX *ctx; + { + BIGNUM *Ri=NULL,*R=NULL; + + if (mont->RR == NULL) mont->RR=BN_new(); + if (mont->N == NULL) mont->N=BN_new(); + + R=mont->RR; /* grab RR as a temp */ + BN_copy(mont->N,mod); /* Set N */ + + mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2; + BN_lshift(R,BN_value_one(),mont->ri); /* R */ + if ((Ri=BN_mod_inverse(NULL,R,mod,ctx)) == NULL) goto err;/* Ri */ + BN_lshift(Ri,Ri,mont->ri); /* R*Ri */ + bn_qsub(Ri,Ri,BN_value_one()); /* R*Ri - 1 */ + BN_div(Ri,NULL,Ri,mod,ctx); + if (mont->Ni != NULL) BN_free(mont->Ni); + mont->Ni=Ri; /* Ni=(R*Ri-1)/N */ + + /* setup RR for conversions */ + BN_lshift(mont->RR,BN_value_one(),mont->ri*2); + 
BN_mod(mont->RR,mont->RR,mont->N,ctx); + + return(1); +err: + return(0); + } + + diff --git a/crypto/bn/test.c b/crypto/bn/test.c new file mode 100644 index 0000000000..e23f21583f --- /dev/null +++ b/crypto/bn/test.c @@ -0,0 +1,252 @@ +#include <stdio.h> +#include "cryptlib.h" +#include "bn_lcl.h" + +#define SIZE 32 + +#define BN_MONT_CTX_set bn_mcs +#define BN_from_montgomery bn_fm +#define BN_mod_mul_montgomery bn_mmm +#undef BN_to_montgomery +#define BN_to_montgomery(r,a,mont,ctx) bn_mmm(\ + r,a,(mont)->RR,(mont),ctx) + +main() + { + BIGNUM prime,a,b,r,A,B,R; + BN_MONT_CTX *mont; + BN_CTX *ctx; + int i; + + ctx=BN_CTX_new(); + BN_init(&prime); + BN_init(&a); BN_init(&b); BN_init(&r); + BN_init(&A); BN_init(&B); BN_init(&R); + + BN_generate_prime(&prime,SIZE,0,NULL,NULL,NULL,NULL); + BN_rand(&A,SIZE,1,0); + BN_rand(&B,SIZE,1,0); + BN_mod(&A,&A,&prime,ctx); + BN_mod(&B,&B,&prime,ctx); + + i=A.top; + BN_mul(&R,&A,&B,ctx); + BN_mask_bits(&R,i*BN_BITS2); + + + BN_print_fp(stdout,&A); printf(" <- a\n"); + BN_print_fp(stdout,&B); printf(" <- b\n"); + BN_mul_high(&r,&A,&B,&R,i); + BN_print_fp(stdout,&r); printf(" <- high(BA*DC)\n"); + + BN_mask_bits(&A,i*32); + BN_mask_bits(&B,i*32); + + BN_mul(&R,&A,&B); + BN_rshift(&R,&R,i*32); + BN_print_fp(stdout,&R); printf(" <- norm BA*DC\n"); + BN_sub(&R,&R,&r); + BN_print_fp(stdout,&R); printf(" <- diff\n"); + } + +#if 0 +int bn_mul_high(r,a,b,low,words) +BIGNUM *r,*a,*b,*low; +int words; + { + int i; + BIGNUM t1,t2,t3,h,ah,al,bh,bl,m,s0,s1; + + BN_init(&al); BN_init(&ah); + BN_init(&bl); BN_init(&bh); + BN_init(&t1); BN_init(&t2); BN_init(&t3); + BN_init(&s0); BN_init(&s1); + BN_init(&h); BN_init(&m); + + i=a->top; + if (i >= words) + { + al.top=words; + ah.top=a->top-words; + ah.d= &(a->d[ah.top]); + } + else + al.top=i; + al.d=a->d; + + i=b->top; + if (i >= words) + { + bl.top=words; + bh.top=i-words; + bh.d= &(b->d[bh.top]); + } + else + bl.top=i; + bl.d=b->d; + + i=low->top; + if (i >= words) + { + s0.top=words; + 
s1.top=i-words; + s1.d= &(low->d[s1.top]); + } + else + s0.top=i; + s0.d=low->d; + +al.max=al.top; ah.max=ah.top; +bl.max=bl.top; bh.max=bh.top; +s0.max=bl.top; s1.max=bh.top; + + /* Calculate (al-ah)*(bh-bl) */ + BN_sub(&t1,&al,&ah); + BN_sub(&t2,&bh,&bl); + BN_mul(&m,&t1,&t2); + + /* Calculate ah*bh */ + BN_mul(&h,&ah,&bh); + + /* s0 == low(al*bl) + * s1 == low(ah*bh)+low((al-ah)*(bh-bl))+low(al*bl)+high(al*bl) + * We know s0 and s1 so the only unknown is high(al*bl) + * high(al*bl) == s1 - low(ah*bh+(al-ah)*(bh-bl)+s0) + */ + BN_add(&m,&m,&h); + BN_add(&t2,&m,&s0); + /* Quick and dirty mask off of high words */ + t3.d=t2.d; + t3.top=(t2.top > words)?words:t2.top; + t3.neg=t2.neg; +t3.max=t3.top; +// BN_print_fp(stdout,&s1); printf(" s1\n"); +// BN_print_fp(stdout,&t2); printf(" middle value\n"); +// BN_print_fp(stdout,&t3); printf(" low middle value\n"); + BN_sub(&t1,&s1,&t3); + + if (t1.neg) + { +//printf("neg fixup\n"); //BN_print_fp(stdout,&t1); printf(" before\n"); + BN_lshift(&t2,BN_value_one(),words*32); + BN_add(&t1,&t2,&t1); + BN_mask_bits(&t1,words*32); +// BN_print_fp(stdout,&t1); printf(" after\n"); + } + /* al*bl == high(al*bl)<<words+s0 */ + BN_lshift(&t1,&t1,words*32); + BN_add(&t1,&t1,&s0); + + /* We now have + * al*bl - t1 + * (al-ah)*(bh-bl)+ah*bh - m + * ah*bh - h + */ + BN_copy(r,&t1); + BN_mask_bits(r,words*32*2); + + /*BN_lshift(&m,&m,words*/ + + BN_free(&t1); BN_free(&t2); + BN_free(&m); BN_free(&h); + } + +int BN_mod_mul_montgomery(r,a,b,mont,ctx) +BIGNUM *r,*a,*b; +BN_MONT_CTX *mont; +BN_CTX *ctx; + { + BIGNUM *tmp; + + tmp= &(ctx->bn[ctx->tos++]); + + if (a == b) + { + if (!BN_sqr(tmp,a,ctx)) goto err; + } + else + { + if (!BN_mul(tmp,a,b)) goto err; + } + /* reduce from aRR to aR */ + if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err; + ctx->tos--; + return(1); +err: + return(0); + } + +int BN_from_montgomery(r,a,mont,ctx) +BIGNUM *r; +BIGNUM *a; +BN_MONT_CTX *mont; +BN_CTX *ctx; + { + BIGNUM z1; + BIGNUM *t1,*t2; + BN_ULONG 
*ap,*bp,*rp; + int j,i,bl,al; + + BN_init(&z1); + t1= &(ctx->bn[ctx->tos]); + t2= &(ctx->bn[ctx->tos+1]); + + if (!BN_copy(t1,a)) goto err; + /* can cheat */ + BN_mask_bits(t1,mont->ri); + if (!BN_mul(t2,t1,mont->Ni)) goto err; + BN_mask_bits(t2,mont->ri); + + if (!BN_mul(t1,t2,mont->N)) goto err; + if (!BN_add(t2,t1,a)) goto err; + + /* At this point, t2 has the bottom ri bits set to zero. + * This means that the bottom ri bits == the 1^ri minus the bottom + * ri bits of a. + * This means that only the bits above 'ri' in a need to be added, + * and XXXXXXXXXXXXXXXXXXXXXXXX + */ +BN_print_fp(stdout,t2); printf("\n"); + BN_rshift(r,t2,mont->ri); + + if (BN_ucmp(r,mont->N) >= 0) + BN_usub(r,r,mont->N); + + return(1); +err: + return(0); + } + +int BN_MONT_CTX_set(mont,mod,ctx) +BN_MONT_CTX *mont; +BIGNUM *mod; +BN_CTX *ctx; + { + BIGNUM *Ri=NULL,*R=NULL; + + if (mont->RR == NULL) mont->RR=BN_new(); + if (mont->N == NULL) mont->N=BN_new(); + + R=mont->RR; /* grab RR as a temp */ + BN_copy(mont->N,mod); /* Set N */ + + mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2; + BN_lshift(R,BN_value_one(),mont->ri); /* R */ + if ((Ri=BN_mod_inverse(NULL,R,mod,ctx)) == NULL) goto err;/* Ri */ + BN_lshift(Ri,Ri,mont->ri); /* R*Ri */ + BN_usub(Ri,Ri,BN_value_one()); /* R*Ri - 1 */ + BN_div(Ri,NULL,Ri,mod,ctx); + if (mont->Ni != NULL) BN_free(mont->Ni); + mont->Ni=Ri; /* Ni=(R*Ri-1)/N */ + + /* setup RR for conversions */ + BN_lshift(mont->RR,BN_value_one(),mont->ri*2); + BN_mod(mont->RR,mont->RR,mont->N,ctx); + + return(1); +err: + return(0); + } + + +#endif diff --git a/crypto/bn/todo b/crypto/bn/todo new file mode 100644 index 0000000000..e47e381aea --- /dev/null +++ b/crypto/bn/todo @@ -0,0 +1,3 @@ +Cache RECP_CTX values +make the result argument independant of the inputs. +split up the _exp_ functions |