| author    | Ralf S. Engelschall <rse@openssl.org> | 1998-12-21 10:52:47 +0000 |
| committer | Ralf S. Engelschall <rse@openssl.org> | 1998-12-21 10:52:47 +0000 |
| commit    | d02b48c63a58ea4367a0e905979f140b7d090f86 (patch) |
| tree      | 504f62ed3d84799f785b9cd9fab255a21b0e1b0e /crypto/bn/asm |
| download  | openssl-d02b48c63a58ea4367a0e905979f140b7d090f86.tar.gz |
Import of old SSLeay release: SSLeay 0.8.1b
Diffstat (limited to 'crypto/bn/asm')
| -rw-r--r-- | crypto/bn/asm/README      |  30 |
| -rw-r--r-- | crypto/bn/asm/alpha.s     | 310 |
| -rw-r--r-- | crypto/bn/asm/pa-risc.s   | 710 |
| -rw-r--r-- | crypto/bn/asm/pa-risc2.s  | 416 |
| -rw-r--r-- | crypto/bn/asm/r3000.s     | 646 |
| -rw-r--r-- | crypto/bn/asm/sparc.s     | 359 |
| -rw-r--r-- | crypto/bn/asm/x86-bsdi.s  | 272 |
| -rw-r--r-- | crypto/bn/asm/x86-lnx.s   | 282 |
| -rw-r--r-- | crypto/bn/asm/x86-lnxa.s  | 282 |
| -rw-r--r-- | crypto/bn/asm/x86-sol.s   | 224 |
| -rw-r--r-- | crypto/bn/asm/x86nt32.asm | 288 |
| -rw-r--r-- | crypto/bn/asm/x86nt32.uu  |  22 |
| -rw-r--r-- | crypto/bn/asm/x86w16.asm  | 297 |
| -rw-r--r-- | crypto/bn/asm/x86w16.uu   |  20 |
| -rw-r--r-- | crypto/bn/asm/x86w32.asm  | 303 |
| -rw-r--r-- | crypto/bn/asm/x86w32.uu   |  23 |

16 files changed, 4484 insertions(+), 0 deletions(-)
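Before the diffs themselves, it helps to know that every file in this directory implements the same four primitives; as the README below explains, they are all just versions of crypto/bn/bn_mulw.c. The following C sketch is an editor's reconstruction of those primitives' semantics from the assembly sources in this commit, not a copy of bn_mulw.c: the fixed-width types, the exact argument order, and the loop shape are assumptions (the era's code used a BN_ULONG typedef rather than <stdint.h>, and the Alpha build uses 64-bit words where this sketch uses 32-bit ones).

```c
#include <stdint.h>

typedef uint32_t BN_ULONG;   /* one word; the Alpha build uses 64-bit words */

/* r[i] += a[i] * w with carry propagation; returns the final carry word. */
BN_ULONG bn_mul_add_word(BN_ULONG *r, BN_ULONG *a, int num, BN_ULONG w)
{
    BN_ULONG c = 0;
    for (int i = 0; i < num; i++) {
        uint64_t t = (uint64_t)w * a[i] + r[i] + c; /* double-width product */
        r[i] = (BN_ULONG)t;                         /* low word back to r   */
        c = (BN_ULONG)(t >> 32);                    /* high word is carry   */
    }
    return c;
}

/* r[i] = a[i] * w + carry; returns the final carry word. */
BN_ULONG bn_mul_word(BN_ULONG *r, BN_ULONG *a, int num, BN_ULONG w)
{
    BN_ULONG c = 0;
    for (int i = 0; i < num; i++) {
        uint64_t t = (uint64_t)w * a[i] + c;
        r[i] = (BN_ULONG)t;
        c = (BN_ULONG)(t >> 32);
    }
    return c;
}

/* Each word squares to a double-width result, so r receives 2*n words. */
void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
{
    for (int i = 0; i < n; i++) {
        uint64_t t = (uint64_t)a[i] * a[i];
        r[2 * i]     = (BN_ULONG)t;
        r[2 * i + 1] = (BN_ULONG)(t >> 32);
    }
}

/* Divide the double word h:l by d (cf. the x86 files: edx:eax / divl). */
BN_ULONG bn_div64(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    return (BN_ULONG)((((uint64_t)h << 32) | l) / d);
}
```

These loop bodies make the Alpha complaint in the README concrete: each step needs the double-width product of two words. On x86 a single mull leaves that product in edx:eax; on the Alpha, which has no 128-bit C type and no carry flag, the hand-written code pairs mulq (low half) with umulh (high half) and synthesizes each carry with cmpult.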
diff --git a/crypto/bn/asm/README b/crypto/bn/asm/README
new file mode 100644
index 0000000000..d93fbff77f
--- /dev/null
+++ b/crypto/bn/asm/README
@@ -0,0 +1,30 @@
+All the assembler files in this directory are just versions of the file
+crypto/bn/bn_mulw.c.
+
+Quite a few of these files are just the assembler output from gcc, since on
+quite a few machines it is 2 times faster than the system compiler's output.
+
+For the x86, I have hand-written assembler because of the bad job all
+compilers seem to do on it. This normally gives a 2 times speedup in the RSA
+routines.
+
+For the DEC Alpha, I also hand-wrote the assembler (except the division,
+which is just the output from the C compiler pasted on the end of the file).
+On the 2 Alpha C compilers I had access to, it was not possible to do
+64b x 64b -> 128b calculations (both the long and the long long data types
+were 64 bits). So the hand assembler gives access to the 128-bit result and
+a 2 times speedup :-).
+
+The x86xxxx.obj files are the assembled versions of the x86xxxx.asm files.
+I had such a hard time finding a macro assembler for Microsoft, I decided to
+include the object files to save others the hassle :-).
+
+I have also included uuencoded versions of the .obj files in case they get
+trashed.
+
+There are 2 versions of assembler for the HP PA-RISC.
+pa-risc.s is the original one, which works fine.
+pa-risc2.s is a new version that often generates warnings, but if the
+tests pass, it gives performance that is over 2 times faster than
+pa-risc.s.
+Both were generated using gcc :-)
diff --git a/crypto/bn/asm/alpha.s b/crypto/bn/asm/alpha.s
new file mode 100644
index 0000000000..d56f715ecd
--- /dev/null
+++ b/crypto/bn/asm/alpha.s
@@ -0,0 +1,310 @@
+ # DEC Alpha assembler
+ # The bn_div64 is actually gcc output but the other parts are hand done.
+ # Thanks to tzeruch@ceddec.com for sending me the gcc output for
+ # bn_div64.
+	.file 1 "bn_mulw.c"
+	.version "01.01"
+	.set noat
+gcc2_compiled.:
+__gnu_compiled_c:
+	.text
+	.align 3
+	.globl bn_mul_add_word
+	.ent bn_mul_add_word
+bn_mul_add_word:
+bn_mul_add_word..ng:
+	.frame $30,0,$26,0
+	.prologue 0
+	subq $18,2,$25	# num-2
+	bis $31,$31,$0
+	blt $25,$42
+	.align 5
+$142:
+	subq $18,2,$18	# num-=2
+	subq $25,2,$25	# num-=2
+
+	ldq $1,0($17)	# a[0]
+	ldq $2,8($17)	# a[1]
+
+	mulq $19,$1,$3	# a[0]*w low part	r3
+	umulh $19,$1,$1	# a[0]*w high part	r1
+	mulq $19,$2,$4	# a[1]*w low part	r4
+	umulh $19,$2,$2	# a[1]*w high part	r2
+
+	ldq $22,0($16)	# r[0]	r22
+	ldq $23,8($16)	# r[1]	r23
+
+	addq $3,$22,$3	# a0 low part + r[0]
+	addq $4,$23,$4	# a1 low part + r[1]
+	cmpult $3,$22,$5	# overflow?
+	cmpult $4,$23,$6	# overflow?
+	addq $5,$1,$1	# high part + overflow
+	addq $6,$2,$2	# high part + overflow
+
+	addq $3,$0,$3	# add c
+	cmpult $3,$0,$5	# overflow?
+	stq $3,0($16)
+	addq $5,$1,$0	# c=high part + overflow
+
+	addq $4,$0,$4	# add c
+	cmpult $4,$0,$5	# overflow?
+	stq $4,8($16)
+	addq $5,$2,$0	# c=high part + overflow
+
+	ble $18,$43
+
+	addq $16,16,$16
+	addq $17,16,$17
+	blt $25,$42
+
+	br $31,$142
+$42:
+	ldq $1,0($17)	# a[0]
+	umulh $19,$1,$3	# a[0]*w high part
+	mulq $19,$1,$1	# a[0]*w low part
+	ldq $2,0($16)	# r[0]
+	addq $1,$2,$1	# low part + r[0]
+	cmpult $1,$2,$4	# overflow?
+	addq $4,$3,$3	# high part + overflow
+	addq $1,$0,$1	# add c
+	cmpult $1,$0,$4	# overflow?
+	addq $4,$3,$0	# c=high part + overflow
+	stq $1,0($16)
+
+	.align 4
+$43:
+	ret $31,($26),1
+	.end bn_mul_add_word
+	.align 3
+	.globl bn_mul_word
+	.ent bn_mul_word
+bn_mul_word:
+bn_mul_word..ng:
+	.frame $30,0,$26,0
+	.prologue 0
+	subq $18,2,$25	# num-2
+	bis $31,$31,$0
+	blt $25,$242
+	.align 5
+$342:
+	subq $18,2,$18	# num-=2
+	subq $25,2,$25	# num-=2
+
+	ldq $1,0($17)	# a[0]
+	ldq $2,8($17)	# a[1]
+
+	mulq $19,$1,$3	# a[0]*w low part	r3
+	umulh $19,$1,$1	# a[0]*w high part	r1
+	mulq $19,$2,$4	# a[1]*w low part	r4
+	umulh $19,$2,$2	# a[1]*w high part	r2
+
+	addq $3,$0,$3	# add c
+	cmpult $3,$0,$5	# overflow?
+	stq $3,0($16)
+	addq $5,$1,$0	# c=high part + overflow
+
+	addq $4,$0,$4	# add c
+	cmpult $4,$0,$5	# overflow?
+	stq $4,8($16)
+	addq $5,$2,$0	# c=high part + overflow
+
+	ble $18,$243
+
+	addq $16,16,$16
+	addq $17,16,$17
+	blt $25,$242
+
+	br $31,$342
+$242:
+	ldq $1,0($17)	# a[0]
+	umulh $19,$1,$3	# a[0]*w high part
+	mulq $19,$1,$1	# a[0]*w low part
+	addq $1,$0,$1	# add c
+	cmpult $1,$0,$4	# overflow?
+	addq $4,$3,$0	# c=high part + overflow
+	stq $1,0($16)
+$243:
+	ret $31,($26),1
+	.end bn_mul_word
+	.align 3
+	.globl bn_sqr_words
+	.ent bn_sqr_words
+bn_sqr_words:
+bn_sqr_words..ng:
+	.frame $30,0,$26,0
+	.prologue 0
+
+	subq $18,2,$25	# num-2
+	blt $25,$442
+	.align 5
+$542:
+	subq $18,2,$18	# num-=2
+	subq $25,2,$25	# num-=2
+
+	ldq $1,0($17)	# a[0]
+	ldq $4,8($17)	# a[1]
+
+	mulq $1,$1,$2	# a[0]*a[0] low part	r2
+	umulh $1,$1,$3	# a[0]*a[0] high part	r3
+	mulq $4,$4,$5	# a[1]*a[1] low part	r5
+	umulh $4,$4,$6	# a[1]*a[1] high part	r6
+
+	stq $2,0($16)	# r[0]
+	stq $3,8($16)	# r[1]
+	stq $5,16($16)	# r[2]
+	stq $6,24($16)	# r[3]
+
+	ble $18,$443
+
+	addq $16,32,$16
+	addq $17,16,$17
+	blt $25,$442
+	br $31,$542
+
+$442:
+	ldq $1,0($17)	# a[0]
+	mulq $1,$1,$2	# a[0]*a[0] low part	r2
+	umulh $1,$1,$3	# a[0]*a[0] high part	r3
+	stq $2,0($16)	# r[0]
+	stq $3,8($16)	# r[1]
+
+	.align 4
+$443:
+	ret $31,($26),1
+	.end bn_sqr_words
+
+	#
+	# What follows was taken directly from the C compiler with a few
+	# hacks to redo the labels.
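+	# (Interface note, inferred from the x86 versions later in this
+	#  commit: bn_div64(h,l,d) returns the double word h:l divided by
+	#  the single word d. x86 does this with a single divl; the
+	#  compiler output below instead builds the quotient from 32-bit
+	#  digits, using divqu for each digit estimate plus
+	#  multiply/subtract corrections.)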
+ # +.text + .align 3 + .globl bn_div64 + .ent bn_div64 +bn_div64: + ldgp $29,0($27) +bn_div64..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$119 + lda $0,-1 + br $31,$136 + .align 4 +$119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$126 + zapnot $7,15,$27 + br $31,$127 + .align 4 +$126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$127: + srl $10,32,$4 + .align 5 +$128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$129 + subq $27,1,$27 + br $31,$128 + .align 4 +$129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$134 + addq $9,$11,$9 + subq $27,1,$27 +$134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$123 + .align 4 +$124: + bis $13,$27,$0 +$136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div64 + .ident "GCC: (GNU) 2.7.2.1" + + diff --git a/crypto/bn/asm/pa-risc.s b/crypto/bn/asm/pa-risc.s new file mode 100644 index 0000000000..c49c433a83 --- /dev/null +++ b/crypto/bn/asm/pa-risc.s @@ -0,0 +1,710 @@ + .SPACE $PRIVATE$ + .SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31 + .SUBSPA $BSS$,QUAD=1,ALIGN=8,ACCESS=31,ZERO,SORT=82 + .SPACE $TEXT$ + .SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=44 + .SUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY + .IMPORT $global$,DATA + .IMPORT $$dyncall,MILLICODE +; gcc_compiled.: + .SPACE $TEXT$ + .SUBSPA $CODE$ + + .align 4 + .EXPORT bn_mul_add_word,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR +bn_mul_add_word + .PROC + .CALLINFO FRAME=0,CALLS,SAVE_RP + .ENTRY + stw %r2,-20(0,%r30) + ldi 0,%r28 + extru %r23,31,16,%r2 + stw %r2,-16(0,%r30) + extru %r23,15,16,%r23 + ldil L'65536,%r31 + fldws -16(0,%r30),%fr11R + stw %r23,-16(0,%r30) + ldo 12(%r25),%r29 + ldo 12(%r26),%r23 + fldws -16(0,%r30),%fr11L +L$0002 + ldw 0(0,%r25),%r19 + extru %r19,31,16,%r20 + stw %r20,-16(0,%r30) + extru %r19,15,16,%r19 + fldws -16(0,%r30),%fr22L + stw %r19,-16(0,%r30) + xmpyu %fr22L,%fr11R,%fr8 + fldws -16(0,%r30),%fr22L + fstws %fr8R,-16(0,%r30) + xmpyu %fr11R,%fr22L,%fr10 + ldw -16(0,%r30),%r2 + stw %r20,-16(0,%r30) + xmpyu %fr22L,%fr11L,%fr9 + fldws -16(0,%r30),%fr22L + fstws %fr10R,-16(0,%r30) + copy %r2,%r22 + ldw -16(0,%r30),%r2 + fstws %fr9R,-16(0,%r30) + xmpyu %fr11L,%fr22L,%fr8 + copy %r2,%r19 + ldw -16(0,%r30),%r2 + fstws %fr8R,-16(0,%r30) + copy %r2,%r20 + ldw -16(0,%r30),%r2 + addl %r2,%r19,%r21 + comclr,<<= %r19,%r21,0 
+ addl %r20,%r31,%r20 +L$0005 + extru %r21,15,16,%r19 + addl %r20,%r19,%r20 + zdep %r21,15,16,%r19 + addl %r22,%r19,%r22 + comclr,<<= %r19,%r22,0 + addi,tr 1,%r20,%r19 + copy %r20,%r19 + addl %r22,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi 1,%r19,%r19 + ldw 0(0,%r26),%r28 + addl %r20,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi,tr 1,%r19,%r28 + copy %r19,%r28 + addib,= -1,%r24,L$0003 + stw %r20,0(0,%r26) + ldw -8(0,%r29),%r19 + extru %r19,31,16,%r20 + stw %r20,-16(0,%r30) + extru %r19,15,16,%r19 + fldws -16(0,%r30),%fr22L + stw %r19,-16(0,%r30) + xmpyu %fr22L,%fr11R,%fr8 + fldws -16(0,%r30),%fr22L + fstws %fr8R,-16(0,%r30) + xmpyu %fr11R,%fr22L,%fr10 + ldw -16(0,%r30),%r2 + stw %r20,-16(0,%r30) + xmpyu %fr22L,%fr11L,%fr9 + fldws -16(0,%r30),%fr22L + fstws %fr10R,-16(0,%r30) + copy %r2,%r22 + ldw -16(0,%r30),%r2 + fstws %fr9R,-16(0,%r30) + xmpyu %fr11L,%fr22L,%fr8 + copy %r2,%r19 + ldw -16(0,%r30),%r2 + fstws %fr8R,-16(0,%r30) + copy %r2,%r20 + ldw -16(0,%r30),%r2 + addl %r2,%r19,%r21 + comclr,<<= %r19,%r21,0 + addl %r20,%r31,%r20 +L$0010 + extru %r21,15,16,%r19 + addl %r20,%r19,%r20 + zdep %r21,15,16,%r19 + addl %r22,%r19,%r22 + comclr,<<= %r19,%r22,0 + addi,tr 1,%r20,%r19 + copy %r20,%r19 + addl %r22,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi 1,%r19,%r19 + ldw -8(0,%r23),%r28 + addl %r20,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi,tr 1,%r19,%r28 + copy %r19,%r28 + addib,= -1,%r24,L$0003 + stw %r20,-8(0,%r23) + ldw -4(0,%r29),%r19 + extru %r19,31,16,%r20 + stw %r20,-16(0,%r30) + extru %r19,15,16,%r19 + fldws -16(0,%r30),%fr22L + stw %r19,-16(0,%r30) + xmpyu %fr22L,%fr11R,%fr8 + fldws -16(0,%r30),%fr22L + fstws %fr8R,-16(0,%r30) + xmpyu %fr11R,%fr22L,%fr10 + ldw -16(0,%r30),%r2 + stw %r20,-16(0,%r30) + xmpyu %fr22L,%fr11L,%fr9 + fldws -16(0,%r30),%fr22L + fstws %fr10R,-16(0,%r30) + copy %r2,%r22 + ldw -16(0,%r30),%r2 + fstws %fr9R,-16(0,%r30) + xmpyu %fr11L,%fr22L,%fr8 + copy %r2,%r19 + ldw -16(0,%r30),%r2 + fstws %fr8R,-16(0,%r30) + copy %r2,%r20 + ldw -16(0,%r30),%r2 + addl %r2,%r19,%r21 + comclr,<<= %r19,%r21,0 + addl %r20,%r31,%r20 +L$0015 + extru %r21,15,16,%r19 + addl %r20,%r19,%r20 + zdep %r21,15,16,%r19 + addl %r22,%r19,%r22 + comclr,<<= %r19,%r22,0 + addi,tr 1,%r20,%r19 + copy %r20,%r19 + addl %r22,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi 1,%r19,%r19 + ldw -4(0,%r23),%r28 + addl %r20,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi,tr 1,%r19,%r28 + copy %r19,%r28 + addib,= -1,%r24,L$0003 + stw %r20,-4(0,%r23) + ldw 0(0,%r29),%r19 + extru %r19,31,16,%r20 + stw %r20,-16(0,%r30) + extru %r19,15,16,%r19 + fldws -16(0,%r30),%fr22L + stw %r19,-16(0,%r30) + xmpyu %fr22L,%fr11R,%fr8 + fldws -16(0,%r30),%fr22L + fstws %fr8R,-16(0,%r30) + xmpyu %fr11R,%fr22L,%fr10 + ldw -16(0,%r30),%r2 + stw %r20,-16(0,%r30) + xmpyu %fr22L,%fr11L,%fr9 + fldws -16(0,%r30),%fr22L + fstws %fr10R,-16(0,%r30) + copy %r2,%r22 + ldw -16(0,%r30),%r2 + fstws %fr9R,-16(0,%r30) + xmpyu %fr11L,%fr22L,%fr8 + copy %r2,%r19 + ldw -16(0,%r30),%r2 + fstws %fr8R,-16(0,%r30) + copy %r2,%r20 + ldw -16(0,%r30),%r2 + addl %r2,%r19,%r21 + comclr,<<= %r19,%r21,0 + addl %r20,%r31,%r20 +L$0020 + extru %r21,15,16,%r19 + addl %r20,%r19,%r20 + zdep %r21,15,16,%r19 + addl %r22,%r19,%r22 + comclr,<<= %r19,%r22,0 + addi,tr 1,%r20,%r19 + copy %r20,%r19 + addl %r22,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi 1,%r19,%r19 + ldw 0(0,%r23),%r28 + addl %r20,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi,tr 1,%r19,%r28 + copy %r19,%r28 + addib,= -1,%r24,L$0003 + stw %r20,0(0,%r23) + ldo 16(%r29),%r29 + ldo 16(%r25),%r25 + ldo 16(%r23),%r23 + bl L$0002,0 + ldo 
16(%r26),%r26 +L$0003 + ldw -20(0,%r30),%r2 + bv,n 0(%r2) + .EXIT + .PROCEND + .align 4 + .EXPORT bn_mul_word,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR +bn_mul_word + .PROC + .CALLINFO FRAME=0,CALLS,SAVE_RP + .ENTRY + stw %r2,-20(0,%r30) + ldi 0,%r28 + extru %r23,31,16,%r2 + stw %r2,-16(0,%r30) + extru %r23,15,16,%r23 + ldil L'65536,%r31 + fldws -16(0,%r30),%fr11R + stw %r23,-16(0,%r30) + ldo 12(%r26),%r29 + ldo 12(%r25),%r23 + fldws -16(0,%r30),%fr11L +L$0026 + ldw 0(0,%r25),%r19 + extru %r19,31,16,%r20 + stw %r20,-16(0,%r30) + extru %r19,15,16,%r19 + fldws -16(0,%r30),%fr22L + stw %r19,-16(0,%r30) + xmpyu %fr22L,%fr11R,%fr8 + fldws -16(0,%r30),%fr22L + fstws %fr8R,-16(0,%r30) + xmpyu %fr11R,%fr22L,%fr10 + ldw -16(0,%r30),%r2 + stw %r20,-16(0,%r30) + xmpyu %fr22L,%fr11L,%fr9 + fldws -16(0,%r30),%fr22L + fstws %fr10R,-16(0,%r30) + copy %r2,%r22 + ldw -16(0,%r30),%r2 + fstws %fr9R,-16(0,%r30) + xmpyu %fr11L,%fr22L,%fr8 + copy %r2,%r19 + ldw -16(0,%r30),%r2 + fstws %fr8R,-16(0,%r30) + copy %r2,%r20 + ldw -16(0,%r30),%r2 + addl %r2,%r19,%r21 + comclr,<<= %r19,%r21,0 + addl %r20,%r31,%r20 +L$0029 + extru %r21,15,16,%r19 + addl %r20,%r19,%r20 + zdep %r21,15,16,%r19 + addl %r22,%r19,%r22 + comclr,<<= %r19,%r22,0 + addi,tr 1,%r20,%r19 + copy %r20,%r19 + addl %r22,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi,tr 1,%r19,%r28 + copy %r19,%r28 + addib,= -1,%r24,L$0027 + stw %r20,0(0,%r26) + ldw -8(0,%r23),%r19 + extru %r19,31,16,%r20 + stw %r20,-16(0,%r30) + extru %r19,15,16,%r19 + fldws -16(0,%r30),%fr22L + stw %r19,-16(0,%r30) + xmpyu %fr22L,%fr11R,%fr8 + fldws -16(0,%r30),%fr22L + fstws %fr8R,-16(0,%r30) + xmpyu %fr11R,%fr22L,%fr10 + ldw -16(0,%r30),%r2 + stw %r20,-16(0,%r30) + xmpyu %fr22L,%fr11L,%fr9 + fldws -16(0,%r30),%fr22L + fstws %fr10R,-16(0,%r30) + copy %r2,%r22 + ldw -16(0,%r30),%r2 + fstws %fr9R,-16(0,%r30) + xmpyu %fr11L,%fr22L,%fr8 + copy %r2,%r19 + ldw -16(0,%r30),%r2 + fstws %fr8R,-16(0,%r30) + copy %r2,%r20 + ldw -16(0,%r30),%r2 + addl %r2,%r19,%r21 + comclr,<<= %r19,%r21,0 + addl %r20,%r31,%r20 +L$0033 + extru %r21,15,16,%r19 + addl %r20,%r19,%r20 + zdep %r21,15,16,%r19 + addl %r22,%r19,%r22 + comclr,<<= %r19,%r22,0 + addi,tr 1,%r20,%r19 + copy %r20,%r19 + addl %r22,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi,tr 1,%r19,%r28 + copy %r19,%r28 + addib,= -1,%r24,L$0027 + stw %r20,-8(0,%r29) + ldw -4(0,%r23),%r19 + extru %r19,31,16,%r20 + stw %r20,-16(0,%r30) + extru %r19,15,16,%r19 + fldws -16(0,%r30),%fr22L + stw %r19,-16(0,%r30) + xmpyu %fr22L,%fr11R,%fr8 + fldws -16(0,%r30),%fr22L + fstws %fr8R,-16(0,%r30) + xmpyu %fr11R,%fr22L,%fr10 + ldw -16(0,%r30),%r2 + stw %r20,-16(0,%r30) + xmpyu %fr22L,%fr11L,%fr9 + fldws -16(0,%r30),%fr22L + fstws %fr10R,-16(0,%r30) + copy %r2,%r22 + ldw -16(0,%r30),%r2 + fstws %fr9R,-16(0,%r30) + xmpyu %fr11L,%fr22L,%fr8 + copy %r2,%r19 + ldw -16(0,%r30),%r2 + fstws %fr8R,-16(0,%r30) + copy %r2,%r20 + ldw -16(0,%r30),%r2 + addl %r2,%r19,%r21 + comclr,<<= %r19,%r21,0 + addl %r20,%r31,%r20 +L$0037 + extru %r21,15,16,%r19 + addl %r20,%r19,%r20 + zdep %r21,15,16,%r19 + addl %r22,%r19,%r22 + comclr,<<= %r19,%r22,0 + addi,tr 1,%r20,%r19 + copy %r20,%r19 + addl %r22,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi,tr 1,%r19,%r28 + copy %r19,%r28 + addib,= -1,%r24,L$0027 + stw %r20,-4(0,%r29) + ldw 0(0,%r23),%r19 + extru %r19,31,16,%r20 + stw %r20,-16(0,%r30) + extru %r19,15,16,%r19 + fldws -16(0,%r30),%fr22L + stw %r19,-16(0,%r30) + xmpyu %fr22L,%fr11R,%fr8 + fldws -16(0,%r30),%fr22L + fstws %fr8R,-16(0,%r30) + xmpyu %fr11R,%fr22L,%fr10 + ldw 
-16(0,%r30),%r2 + stw %r20,-16(0,%r30) + xmpyu %fr22L,%fr11L,%fr9 + fldws -16(0,%r30),%fr22L + fstws %fr10R,-16(0,%r30) + copy %r2,%r22 + ldw -16(0,%r30),%r2 + fstws %fr9R,-16(0,%r30) + xmpyu %fr11L,%fr22L,%fr8 + copy %r2,%r19 + ldw -16(0,%r30),%r2 + fstws %fr8R,-16(0,%r30) + copy %r2,%r20 + ldw -16(0,%r30),%r2 + addl %r2,%r19,%r21 + comclr,<<= %r19,%r21,0 + addl %r20,%r31,%r20 +L$0041 + extru %r21,15,16,%r19 + addl %r20,%r19,%r20 + zdep %r21,15,16,%r19 + addl %r22,%r19,%r22 + comclr,<<= %r19,%r22,0 + addi,tr 1,%r20,%r19 + copy %r20,%r19 + addl %r22,%r28,%r20 + comclr,<<= %r28,%r20,0 + addi,tr 1,%r19,%r28 + copy %r19,%r28 + addib,= -1,%r24,L$0027 + stw %r20,0(0,%r29) + ldo 16(%r23),%r23 + ldo 16(%r25),%r25 + ldo 16(%r29),%r29 + bl L$0026,0 + ldo 16(%r26),%r26 +L$0027 + ldw -20(0,%r30),%r2 + bv,n 0(%r2) + .EXIT + .PROCEND + .align 4 + .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR +bn_sqr_words + .PROC + .CALLINFO FRAME=0,NO_CALLS + .ENTRY + ldo 28(%r26),%r23 + ldo 12(%r25),%r28 +L$0046 + ldw 0(0,%r25),%r21 + extru %r21,31,16,%r22 + stw %r22,-16(0,%r30) + extru %r21,15,16,%r21 + fldws -16(0,%r30),%fr10L + stw %r21,-16(0,%r30) + fldws -16(0,%r30),%fr10R + xmpyu %fr10L,%fr10R,%fr8 + fstws %fr8R,-16(0,%r30) + ldw -16(0,%r30),%r29 + stw %r22,-16(0,%r30) + fldws -16(0,%r30),%fr10R + stw %r21,-16(0,%r30) + copy %r29,%r19 + xmpyu %fr10L,%fr10R,%fr8 + fldws -16(0,%r30),%fr10L + stw %r21,-16(0,%r30) + fldws -16(0,%r30),%fr10R + fstws %fr8R,-16(0,%r30) + extru %r19,16,17,%r20 + zdep %r19,14,15,%r19 + ldw -16(0,%r30),%r29 + xmpyu %fr10L,%fr10R,%fr9 + addl %r29,%r19,%r22 + stw %r22,0(0,%r26) + fstws %fr9R,-16(0,%r30) + ldw -16(0,%r30),%r29 + addl %r29,%r20,%r21 + comclr,<<= %r19,%r22,0 + addi 1,%r21,%r21 + addib,= -1,%r24,L$0057 + stw %r21,-24(0,%r23) + ldw -8(0,%r28),%r21 + extru %r21,31,16,%r22 + stw %r22,-16(0,%r30) + extru %r21,15,16,%r21 + fldws -16(0,%r30),%fr10L + stw %r21,-16(0,%r30) + fldws -16(0,%r30),%fr10R + xmpyu %fr10L,%fr10R,%fr8 + fstws %fr8R,-16(0,%r30) + ldw -16(0,%r30),%r29 + stw %r22,-16(0,%r30) + fldws -16(0,%r30),%fr10R + stw %r21,-16(0,%r30) + copy %r29,%r19 + xmpyu %fr10L,%fr10R,%fr8 + fldws -16(0,%r30),%fr10L + stw %r21,-16(0,%r30) + fldws -16(0,%r30),%fr10R + fstws %fr8R,-16(0,%r30) + extru %r19,16,17,%r20 + zdep %r19,14,15,%r19 + ldw -16(0,%r30),%r29 + xmpyu %fr10L,%fr10R,%fr9 + addl %r29,%r19,%r22 + stw %r22,-20(0,%r23) + fstws %fr9R,-16(0,%r30) + ldw -16(0,%r30),%r29 + addl %r29,%r20,%r21 + comclr,<<= %r19,%r22,0 + addi 1,%r21,%r21 + addib,= -1,%r24,L$0057 + stw %r21,-16(0,%r23) + ldw -4(0,%r28),%r21 + extru %r21,31,16,%r22 + stw %r22,-16(0,%r30) + extru %r21,15,16,%r21 + fldws -16(0,%r30),%fr10L + stw %r21,-16(0,%r30) + fldws -16(0,%r30),%fr10R + xmpyu %fr10L,%fr10R,%fr8 + fstws %fr8R,-16(0,%r30) + ldw -16(0,%r30),%r29 + stw %r22,-16(0,%r30) + fldws -16(0,%r30),%fr10R + stw %r21,-16(0,%r30) + copy %r29,%r19 + xmpyu %fr10L,%fr10R,%fr8 + fldws -16(0,%r30),%fr10L + stw %r21,-16(0,%r30) + fldws -16(0,%r30),%fr10R + fstws %fr8R,-16(0,%r30) + extru %r19,16,17,%r20 + zdep %r19,14,15,%r19 + ldw -16(0,%r30),%r29 + xmpyu %fr10L,%fr10R,%fr9 + addl %r29,%r19,%r22 + stw %r22,-12(0,%r23) + fstws %fr9R,-16(0,%r30) + ldw -16(0,%r30),%r29 + addl %r29,%r20,%r21 + comclr,<<= %r19,%r22,0 + addi 1,%r21,%r21 + addib,= -1,%r24,L$0057 + stw %r21,-8(0,%r23) + ldw 0(0,%r28),%r21 + extru %r21,31,16,%r22 + stw %r22,-16(0,%r30) + extru %r21,15,16,%r21 + fldws -16(0,%r30),%fr10L + stw %r21,-16(0,%r30) + fldws -16(0,%r30),%fr10R + xmpyu %fr10L,%fr10R,%fr8 + fstws %fr8R,-16(0,%r30) + 
ldw -16(0,%r30),%r29 + stw %r22,-16(0,%r30) + fldws -16(0,%r30),%fr10R + stw %r21,-16(0,%r30) + copy %r29,%r19 + xmpyu %fr10L,%fr10R,%fr8 + fldws -16(0,%r30),%fr10L + stw %r21,-16(0,%r30) + fldws -16(0,%r30),%fr10R + fstws %fr8R,-16(0,%r30) + extru %r19,16,17,%r20 + zdep %r19,14,15,%r19 + ldw -16(0,%r30),%r29 + xmpyu %fr10L,%fr10R,%fr9 + addl %r29,%r19,%r22 + stw %r22,-4(0,%r23) + fstws %fr9R,-16(0,%r30) + ldw -16(0,%r30),%r29 + addl %r29,%r20,%r21 + comclr,<<= %r19,%r22,0 + addi 1,%r21,%r21 + addib,= -1,%r24,L$0057 + stw %r21,0(0,%r23) + ldo 16(%r28),%r28 + ldo 16(%r25),%r25 + ldo 32(%r23),%r23 + bl L$0046,0 + ldo 32(%r26),%r26 +L$0057 + bv,n 0(%r2) + .EXIT + .PROCEND + .IMPORT BN_num_bits_word,CODE + .IMPORT fprintf,CODE + .IMPORT __iob,DATA + .SPACE $TEXT$ + .SUBSPA $LIT$ + + .align 4 +L$C0000 + .STRING "Division would overflow\x0a\x00" + .IMPORT abort,CODE + .SPACE $TEXT$ + .SUBSPA $CODE$ + + .align 4 + .EXPORT bn_div64,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR +bn_div64 + .PROC + .CALLINFO FRAME=128,CALLS,SAVE_RP,ENTRY_GR=8 + .ENTRY + stw %r2,-20(0,%r30) + stwm %r8,128(0,%r30) + stw %r7,-124(0,%r30) + stw %r4,-112(0,%r30) + stw %r3,-108(0,%r30) + copy %r26,%r3 + copy %r25,%r4 + stw %r6,-120(0,%r30) + ldi 0,%r7 + stw %r5,-116(0,%r30) + movb,<> %r24,%r5,L$0059 + ldi 2,%r6 + bl L$0076,0 + ldi -1,%r28 +L$0059 + .CALL ARGW0=GR + bl BN_num_bits_word,%r2 + copy %r5,%r26 + ldi 32,%r19 + comb,= %r19,%r28,L$0060 + subi 31,%r28,%r19 + mtsar %r19 + zvdepi 1,32,%r19 + comb,>>= %r19,%r3,L$0060 + addil LR'__iob-$global$+32,%r27 + ldo RR'__iob-$global$+32(%r1),%r26 + ldil LR'L$C0000,%r25 + .CALL ARGW0=GR,ARGW1=GR + bl fprintf,%r2 + ldo RR'L$C0000(%r25),%r25 + .CALL + bl abort,%r2 + nop +L$0060 + comb,>> %r5,%r3,L$0061 + subi 32,%r28,%r28 + sub %r3,%r5,%r3 +L$0061 + comib,= 0,%r28,L$0062 + subi 31,%r28,%r19 + mtsar %r19 + zvdep %r5,32,%r5 + zvdep %r3,32,%r21 + subi 32,%r28,%r20 + mtsar %r20 + vshd 0,%r4,%r20 + or %r21,%r20,%r3 + mtsar %r19 + zvdep %r4,32,%r4 +L$0062 + extru %r5,15,16,%r23 + extru %r5,31,16,%r28 +L$0063 + extru %r3,15,16,%r19 + comb,<> %r23,%r19,L$0066 + copy %r3,%r26 + bl L$0067,0 + zdepi -1,31,16,%r29 +L$0066 + .IMPORT $$divU,MILLICODE + bl $$divU,%r31 + copy %r23,%r25 +L$0067 + stw %r29,-16(0,%r30) + fldws -16(0,%r30),%fr10L + stw %r28,-16(0,%r30) + fldws -16(0,%r30),%fr10R + stw %r23,-16(0,%r30) + xmpyu %fr10L,%fr10R,%fr8 + fldws -16(0,%r30),%fr10R + fstws %fr8R,-16(0,%r30) + xmpyu %fr10L,%fr10R,%fr9 + ldw -16(0,%r30),%r8 + fstws %fr9R,-16(0,%r30) + copy %r8,%r22 + ldw -16(0,%r30),%r8 + extru %r4,15,16,%r24 + copy %r8,%r21 +L$0068 + sub %r3,%r21,%r20 + copy %r20,%r19 + depi 0,31,16,%r19 + comib,<> 0,%r19,L$0069 + zdep %r20,15,16,%r19 + addl %r19,%r24,%r19 + comb,>>= %r19,%r22,L$0069 + sub %r22,%r28,%r22 + sub %r21,%r23,%r21 + bl L$0068,0 + ldo -1(%r29),%r29 +L$0069 + stw %r29,-16(0,%r30) + fldws -16(0,%r30),%fr10L + stw %r28,-16(0,%r30) + fldws -16(0,%r30),%fr10R + xmpyu %fr10L,%fr10R,%fr8 + fstws %fr8R,-16(0,%r30) + ldw -16(0,%r30),%r8 + stw %r23,-16(0,%r30) + fldws -16(0,%r30),%fr10R + copy %r8,%r19 + xmpyu %fr10L,%fr10R,%fr8 + fstws %fr8R,-16(0,%r30) + extru %r19,15,16,%r20 + ldw -16(0,%r30),%r8 + zdep %r19,15,16,%r19 + addl %r8,%r20,%r20 + comclr,<<= %r19,%r4,0 + addi 1,%r20,%r20 + comb,<<= %r20,%r3,L$0074 + sub %r4,%r19,%r4 + addl %r3,%r5,%r3 + ldo -1(%r29),%r29 +L$0074 + addib,= -1,%r6,L$0064 + sub %r3,%r20,%r3 + zdep %r29,15,16,%r7 + shd %r3,%r4,16,%r3 + bl L$0063,0 + zdep %r4,15,16,%r4 +L$0064 + or %r7,%r29,%r28 +L$0076 + ldw -148(0,%r30),%r2 + ldw 
-124(0,%r30),%r7 + ldw -120(0,%r30),%r6 + ldw -116(0,%r30),%r5 + ldw -112(0,%r30),%r4 + ldw -108(0,%r30),%r3 + bv 0(%r2) + ldwm -128(0,%r30),%r8 + .EXIT + .PROCEND diff --git a/crypto/bn/asm/pa-risc2.s b/crypto/bn/asm/pa-risc2.s new file mode 100644 index 0000000000..5e07b7d2e8 --- /dev/null +++ b/crypto/bn/asm/pa-risc2.s @@ -0,0 +1,416 @@ + .SPACE $PRIVATE$ + .SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31 + .SUBSPA $BSS$,QUAD=1,ALIGN=8,ACCESS=31,ZERO,SORT=82 + .SPACE $TEXT$ + .SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=44 + .SUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY + .IMPORT $global$,DATA + .IMPORT $$dyncall,MILLICODE +; gcc_compiled.: + .SPACE $TEXT$ + .SUBSPA $CODE$ + + .align 4 + .EXPORT bn_mul_add_word,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR +bn_mul_add_word + .PROC + .CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=4 + .ENTRY + stw %r2,-20(0,%r30) + stwm %r4,64(0,%r30) + copy %r24,%r31 + stw %r3,-60(0,%r30) + ldi 0,%r20 + ldo 12(%r26),%r2 + stw %r23,-16(0,%r30) + copy %r25,%r3 + ldo 12(%r3),%r1 + fldws -16(0,%r30),%fr8L +L$0010 + copy %r20,%r25 + ldi 0,%r24 + fldws 0(0,%r3),%fr9L + ldw 0(0,%r26),%r19 + xmpyu %fr8L,%fr9L,%fr9 + fstds %fr9,-16(0,%r30) + copy %r19,%r23 + ldw -16(0,%r30),%r28 + ldw -12(0,%r30),%r29 + ldi 0,%r22 + add %r23,%r29,%r29 + addc %r22,%r28,%r28 + add %r25,%r29,%r29 + addc %r24,%r28,%r28 + copy %r28,%r21 + ldi 0,%r20 + copy %r21,%r20 + addib,= -1,%r31,L$0011 + stw %r29,0(0,%r26) + copy %r20,%r25 + ldi 0,%r24 + fldws -8(0,%r1),%fr9L + ldw -8(0,%r2),%r19 + xmpyu %fr8L,%fr9L,%fr9 + fstds %fr9,-16(0,%r30) + copy %r19,%r23 + ldw -16(0,%r30),%r28 + ldw -12(0,%r30),%r29 + ldi 0,%r22 + add %r23,%r29,%r29 + addc %r22,%r28,%r28 + add %r25,%r29,%r29 + addc %r24,%r28,%r28 + copy %r28,%r21 + ldi 0,%r20 + copy %r21,%r20 + addib,= -1,%r31,L$0011 + stw %r29,-8(0,%r2) + copy %r20,%r25 + ldi 0,%r24 + fldws -4(0,%r1),%fr9L + ldw -4(0,%r2),%r19 + xmpyu %fr8L,%fr9L,%fr9 + fstds %fr9,-16(0,%r30) + copy %r19,%r23 + ldw -16(0,%r30),%r28 + ldw -12(0,%r30),%r29 + ldi 0,%r22 + add %r23,%r29,%r29 + addc %r22,%r28,%r28 + add %r25,%r29,%r29 + addc %r24,%r28,%r28 + copy %r28,%r21 + ldi 0,%r20 + copy %r21,%r20 + addib,= -1,%r31,L$0011 + stw %r29,-4(0,%r2) + copy %r20,%r25 + ldi 0,%r24 + fldws 0(0,%r1),%fr9L + ldw 0(0,%r2),%r19 + xmpyu %fr8L,%fr9L,%fr9 + fstds %fr9,-16(0,%r30) + copy %r19,%r23 + ldw -16(0,%r30),%r28 + ldw -12(0,%r30),%r29 + ldi 0,%r22 + add %r23,%r29,%r29 + addc %r22,%r28,%r28 + add %r25,%r29,%r29 + addc %r24,%r28,%r28 + copy %r28,%r21 + ldi 0,%r20 + copy %r21,%r20 + addib,= -1,%r31,L$0011 + stw %r29,0(0,%r2) + ldo 16(%r1),%r1 + ldo 16(%r3),%r3 + ldo 16(%r2),%r2 + bl L$0010,0 + ldo 16(%r26),%r26 +L$0011 + copy %r20,%r28 + ldw -84(0,%r30),%r2 + ldw -60(0,%r30),%r3 + bv 0(%r2) + ldwm -64(0,%r30),%r4 + .EXIT + .PROCEND + .align 4 + .EXPORT bn_mul_word,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR +bn_mul_word + .PROC + .CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=3 + .ENTRY + stw %r2,-20(0,%r30) + copy %r25,%r2 + stwm %r4,64(0,%r30) + copy %r24,%r19 + ldi 0,%r28 + stw %r23,-16(0,%r30) + ldo 12(%r26),%r31 + ldo 12(%r2),%r29 + fldws -16(0,%r30),%fr8L +L$0026 + fldws 0(0,%r2),%fr9L + xmpyu %fr8L,%fr9L,%fr9 + fstds %fr9,-16(0,%r30) + copy %r28,%r21 + ldi 0,%r20 + ldw -16(0,%r30),%r24 + ldw -12(0,%r30),%r25 + add %r21,%r25,%r25 + addc %r20,%r24,%r24 + copy %r24,%r23 + ldi 0,%r22 + copy %r23,%r28 + addib,= -1,%r19,L$0027 + stw %r25,0(0,%r26) + fldws -8(0,%r29),%fr9L + xmpyu %fr8L,%fr9L,%fr9 + fstds %fr9,-16(0,%r30) + copy %r28,%r21 + ldi 0,%r20 + ldw 
-16(0,%r30),%r24 + ldw -12(0,%r30),%r25 + add %r21,%r25,%r25 + addc %r20,%r24,%r24 + copy %r24,%r23 + ldi 0,%r22 + copy %r23,%r28 + addib,= -1,%r19,L$0027 + stw %r25,-8(0,%r31) + fldws -4(0,%r29),%fr9L + xmpyu %fr8L,%fr9L,%fr9 + fstds %fr9,-16(0,%r30) + copy %r28,%r21 + ldi 0,%r20 + ldw -16(0,%r30),%r24 + ldw -12(0,%r30),%r25 + add %r21,%r25,%r25 + addc %r20,%r24,%r24 + copy %r24,%r23 + ldi 0,%r22 + copy %r23,%r28 + addib,= -1,%r19,L$0027 + stw %r25,-4(0,%r31) + fldws 0(0,%r29),%fr9L + xmpyu %fr8L,%fr9L,%fr9 + fstds %fr9,-16(0,%r30) + copy %r28,%r21 + ldi 0,%r20 + ldw -16(0,%r30),%r24 + ldw -12(0,%r30),%r25 + add %r21,%r25,%r25 + addc %r20,%r24,%r24 + copy %r24,%r23 + ldi 0,%r22 + copy %r23,%r28 + addib,= -1,%r19,L$0027 + stw %r25,0(0,%r31) + ldo 16(%r29),%r29 + ldo 16(%r2),%r2 + ldo 16(%r31),%r31 + bl L$0026,0 + ldo 16(%r26),%r26 +L$0027 + ldw -84(0,%r30),%r2 + bv 0(%r2) + ldwm -64(0,%r30),%r4 + .EXIT + .PROCEND + .align 4 + .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR +bn_sqr_words + .PROC + .CALLINFO FRAME=0,NO_CALLS + .ENTRY + ldo 28(%r26),%r19 + ldo 12(%r25),%r28 +L$0042 + fldws 0(0,%r25),%fr8L + fldws 0(0,%r25),%fr8R + xmpyu %fr8L,%fr8R,%fr8 + fstds %fr8,-16(0,%r30) + ldw -16(0,%r30),%r22 + ldw -12(0,%r30),%r23 + stw %r23,0(0,%r26) + copy %r22,%r21 + ldi 0,%r20 + addib,= -1,%r24,L$0049 + stw %r21,-24(0,%r19) + fldws -8(0,%r28),%fr8L + fldws -8(0,%r28),%fr8R + xmpyu %fr8L,%fr8R,%fr8 + fstds %fr8,-16(0,%r30) + ldw -16(0,%r30),%r22 + ldw -12(0,%r30),%r23 + stw %r23,-20(0,%r19) + copy %r22,%r21 + ldi 0,%r20 + addib,= -1,%r24,L$0049 + stw %r21,-16(0,%r19) + fldws -4(0,%r28),%fr8L + fldws -4(0,%r28),%fr8R + xmpyu %fr8L,%fr8R,%fr8 + fstds %fr8,-16(0,%r30) + ldw -16(0,%r30),%r22 + ldw -12(0,%r30),%r23 + stw %r23,-12(0,%r19) + copy %r22,%r21 + ldi 0,%r20 + addib,= -1,%r24,L$0049 + stw %r21,-8(0,%r19) + fldws 0(0,%r28),%fr8L + fldws 0(0,%r28),%fr8R + xmpyu %fr8L,%fr8R,%fr8 + fstds %fr8,-16(0,%r30) + ldw -16(0,%r30),%r22 + ldw -12(0,%r30),%r23 + stw %r23,-4(0,%r19) + copy %r22,%r21 + ldi 0,%r20 + addib,= -1,%r24,L$0049 + stw %r21,0(0,%r19) + ldo 16(%r28),%r28 + ldo 16(%r25),%r25 + ldo 32(%r19),%r19 + bl L$0042,0 + ldo 32(%r26),%r26 +L$0049 + bv,n 0(%r2) + .EXIT + .PROCEND + .IMPORT BN_num_bits_word,CODE + .IMPORT fprintf,CODE + .IMPORT __iob,DATA + .SPACE $TEXT$ + .SUBSPA $LIT$ + + .align 4 +L$C0000 + .STRING "Division would overflow (%d)\x0a\x00" + .IMPORT abort,CODE + .SPACE $TEXT$ + .SUBSPA $CODE$ + + .align 4 + .EXPORT bn_div64,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR +bn_div64 + .PROC + .CALLINFO FRAME=128,CALLS,SAVE_RP,ENTRY_GR=8 + .ENTRY + stw %r2,-20(0,%r30) + stwm %r8,128(0,%r30) + stw %r7,-124(0,%r30) + stw %r4,-112(0,%r30) + stw %r3,-108(0,%r30) + copy %r26,%r3 + copy %r25,%r4 + stw %r6,-120(0,%r30) + ldi 0,%r7 + stw %r5,-116(0,%r30) + movb,<> %r24,%r5,L$0051 + ldi 2,%r6 + bl L$0068,0 + ldi -1,%r28 +L$0051 + .CALL ARGW0=GR + bl BN_num_bits_word,%r2 + copy %r5,%r26 + copy %r28,%r24 + ldi 32,%r19 + comb,= %r19,%r24,L$0052 + subi 31,%r24,%r19 + mtsar %r19 + zvdepi 1,32,%r19 + comb,>>= %r19,%r3,L$0052 + addil LR'__iob-$global$+32,%r27 + ldo RR'__iob-$global$+32(%r1),%r26 + ldil LR'L$C0000,%r25 + .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR + bl fprintf,%r2 + ldo RR'L$C0000(%r25),%r25 + .CALL + bl abort,%r2 + nop +L$0052 + comb,>> %r5,%r3,L$0053 + subi 32,%r24,%r24 + sub %r3,%r5,%r3 +L$0053 + comib,= 0,%r24,L$0054 + subi 31,%r24,%r19 + mtsar %r19 + zvdep %r5,32,%r5 + zvdep %r3,32,%r21 + subi 32,%r24,%r20 + mtsar %r20 + vshd 0,%r4,%r20 + or %r21,%r20,%r3 + mtsar %r19 + 
zvdep %r4,32,%r4 +L$0054 + extru %r5,15,16,%r23 + extru %r5,31,16,%r28 +L$0055 + extru %r3,15,16,%r19 + comb,<> %r23,%r19,L$0058 + copy %r3,%r26 + bl L$0059,0 + zdepi -1,31,16,%r29 +L$0058 + .IMPORT $$divU,MILLICODE + bl $$divU,%r31 + copy %r23,%r25 +L$0059 + stw %r29,-16(0,%r30) + fldws -16(0,%r30),%fr10L + stw %r28,-16(0,%r30) + fldws -16(0,%r30),%fr10R + stw %r23,-16(0,%r30) + xmpyu %fr10L,%fr10R,%fr8 + fldws -16(0,%r30),%fr10R + fstws %fr8R,-16(0,%r30) + xmpyu %fr10L,%fr10R,%fr9 + ldw -16(0,%r30),%r8 + fstws %fr9R,-16(0,%r30) + copy %r8,%r22 + ldw -16(0,%r30),%r8 + extru %r4,15,16,%r24 + copy %r8,%r21 +L$0060 + sub %r3,%r21,%r20 + copy %r20,%r19 + depi 0,31,16,%r19 + comib,<> 0,%r19,L$0061 + zdep %r20,15,16,%r19 + addl %r19,%r24,%r19 + comb,>>= %r19,%r22,L$0061 + sub %r22,%r28,%r22 + sub %r21,%r23,%r21 + bl L$0060,0 + ldo -1(%r29),%r29 +L$0061 + stw %r29,-16(0,%r30) + fldws -16(0,%r30),%fr10L + stw %r28,-16(0,%r30) + fldws -16(0,%r30),%fr10R + xmpyu %fr10L,%fr10R,%fr8 + fstws %fr8R,-16(0,%r30) + ldw -16(0,%r30),%r8 + stw %r23,-16(0,%r30) + fldws -16(0,%r30),%fr10R + copy %r8,%r19 + xmpyu %fr10L,%fr10R,%fr8 + fstws %fr8R,-16(0,%r30) + extru %r19,15,16,%r20 + ldw -16(0,%r30),%r8 + zdep %r19,15,16,%r19 + addl %r8,%r20,%r20 + comclr,<<= %r19,%r4,0 + addi 1,%r20,%r20 + comb,<<= %r20,%r3,L$0066 + sub %r4,%r19,%r4 + addl %r3,%r5,%r3 + ldo -1(%r29),%r29 +L$0066 + addib,= -1,%r6,L$0056 + sub %r3,%r20,%r3 + zdep %r29,15,16,%r7 + shd %r3,%r4,16,%r3 + bl L$0055,0 + zdep %r4,15,16,%r4 +L$0056 + or %r7,%r29,%r28 +L$0068 + ldw -148(0,%r30),%r2 + ldw -124(0,%r30),%r7 + ldw -120(0,%r30),%r6 + ldw -116(0,%r30),%r5 + ldw -112(0,%r30),%r4 + ldw -108(0,%r30),%r3 + bv 0(%r2) + ldwm -128(0,%r30),%r8 + .EXIT + .PROCEND diff --git a/crypto/bn/asm/r3000.s b/crypto/bn/asm/r3000.s new file mode 100644 index 0000000000..5be2a0d0e6 --- /dev/null +++ b/crypto/bn/asm/r3000.s @@ -0,0 +1,646 @@ + .file 1 "../bn_mulw.c" + .set nobopt + .option pic2 + + # GNU C 2.6.3 [AL 1.1, MM 40] SGI running IRIX 5.0 compiled by GNU C + + # Cc1 defaults: + # -mabicalls + + # Cc1 arguments (-G value = 0, Cpu = 3000, ISA = 1): + # -quiet -dumpbase -O2 -o + +gcc2_compiled.: +__gnu_compiled_c: + .rdata + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x31,0x2e,0x34,0x39,0x20 + .byte 0x24,0x0 + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x31,0x2e,0x33,0x34,0x20 + .byte 0x24,0x0 + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x31,0x2e,0x35,0x20,0x24 + .byte 0x0 + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x31,0x2e,0x38,0x20,0x24 + .byte 0x0 + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x31,0x2e,0x32,0x33,0x20 + .byte 0x24,0x0 + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x31,0x2e,0x37,0x38,0x20 + .byte 0x24,0x0 + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x33,0x2e,0x37,0x30,0x20 + .byte 0x24,0x0 + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x31,0x2e,0x32,0x20,0x24 + .byte 0x0 + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x31,0x2e,0x34,0x20,0x24 + .byte 0x0 + + .byte 0x24,0x52,0x65,0x76,0x69,0x73,0x69,0x6f + .byte 0x6e,0x3a,0x20,0x31,0x2e,0x38,0x20,0x24 + .byte 0x0 + .text + .align 2 + .globl bn_mul_add_word + .ent bn_mul_add_word +bn_mul_add_word: + .frame $sp,0,$31 # vars= 0, regs= 0/0, args= 0, extra= 0 + .mask 0x00000000,0 + .fmask 0x00000000,0 + .set noreorder + .cpload $25 + .set reorder + 
move $12,$4 + move $14,$5 + move $9,$6 + move $13,$7 + move $8,$0 + addu $10,$12,12 + addu $11,$14,12 +$L2: + lw $6,0($14) + #nop + multu $13,$6 + mfhi $6 + mflo $7 + #nop + move $5,$8 + move $4,$0 + lw $3,0($12) + addu $9,$9,-1 + move $2,$0 + addu $7,$7,$3 + sltu $8,$7,$3 + addu $6,$6,$2 + addu $6,$6,$8 + addu $7,$7,$5 + sltu $2,$7,$5 + addu $6,$6,$4 + addu $6,$6,$2 + srl $3,$6,0 + move $2,$0 + move $8,$3 + .set noreorder + .set nomacro + beq $9,$0,$L3 + sw $7,0($12) + .set macro + .set reorder + + lw $6,-8($11) + #nop + multu $13,$6 + mfhi $6 + mflo $7 + #nop + move $5,$8 + move $4,$0 + lw $3,-8($10) + addu $9,$9,-1 + move $2,$0 + addu $7,$7,$3 + sltu $8,$7,$3 + addu $6,$6,$2 + addu $6,$6,$8 + addu $7,$7,$5 + sltu $2,$7,$5 + addu $6,$6,$4 + addu $6,$6,$2 + srl $3,$6,0 + move $2,$0 + move $8,$3 + .set noreorder + .set nomacro + beq $9,$0,$L3 + sw $7,-8($10) + .set macro + .set reorder + + lw $6,-4($11) + #nop + multu $13,$6 + mfhi $6 + mflo $7 + #nop + move $5,$8 + move $4,$0 + lw $3,-4($10) + addu $9,$9,-1 + move $2,$0 + addu $7,$7,$3 + sltu $8,$7,$3 + addu $6,$6,$2 + addu $6,$6,$8 + addu $7,$7,$5 + sltu $2,$7,$5 + addu $6,$6,$4 + addu $6,$6,$2 + srl $3,$6,0 + move $2,$0 + move $8,$3 + .set noreorder + .set nomacro + beq $9,$0,$L3 + sw $7,-4($10) + .set macro + .set reorder + + lw $6,0($11) + #nop + multu $13,$6 + mfhi $6 + mflo $7 + #nop + move $5,$8 + move $4,$0 + lw $3,0($10) + addu $9,$9,-1 + move $2,$0 + addu $7,$7,$3 + sltu $8,$7,$3 + addu $6,$6,$2 + addu $6,$6,$8 + addu $7,$7,$5 + sltu $2,$7,$5 + addu $6,$6,$4 + addu $6,$6,$2 + srl $3,$6,0 + move $2,$0 + move $8,$3 + .set noreorder + .set nomacro + beq $9,$0,$L3 + sw $7,0($10) + .set macro + .set reorder + + addu $11,$11,16 + addu $14,$14,16 + addu $10,$10,16 + .set noreorder + .set nomacro + j $L2 + addu $12,$12,16 + .set macro + .set reorder + +$L3: + .set noreorder + .set nomacro + j $31 + move $2,$8 + .set macro + .set reorder + + .end bn_mul_add_word + .align 2 + .globl bn_mul_word + .ent bn_mul_word +bn_mul_word: + .frame $sp,0,$31 # vars= 0, regs= 0/0, args= 0, extra= 0 + .mask 0x00000000,0 + .fmask 0x00000000,0 + .set noreorder + .cpload $25 + .set reorder + move $11,$4 + move $12,$5 + move $8,$6 + move $6,$0 + addu $10,$11,12 + addu $9,$12,12 +$L10: + lw $4,0($12) + #nop + multu $7,$4 + mfhi $4 + mflo $5 + #nop + move $3,$6 + move $2,$0 + addu $8,$8,-1 + addu $5,$5,$3 + sltu $6,$5,$3 + addu $4,$4,$2 + addu $4,$4,$6 + srl $3,$4,0 + move $2,$0 + move $6,$3 + .set noreorder + .set nomacro + beq $8,$0,$L11 + sw $5,0($11) + .set macro + .set reorder + + lw $4,-8($9) + #nop + multu $7,$4 + mfhi $4 + mflo $5 + #nop + move $3,$6 + move $2,$0 + addu $8,$8,-1 + addu $5,$5,$3 + sltu $6,$5,$3 + addu $4,$4,$2 + addu $4,$4,$6 + srl $3,$4,0 + move $2,$0 + move $6,$3 + .set noreorder + .set nomacro + beq $8,$0,$L11 + sw $5,-8($10) + .set macro + .set reorder + + lw $4,-4($9) + #nop + multu $7,$4 + mfhi $4 + mflo $5 + #nop + move $3,$6 + move $2,$0 + addu $8,$8,-1 + addu $5,$5,$3 + sltu $6,$5,$3 + addu $4,$4,$2 + addu $4,$4,$6 + srl $3,$4,0 + move $2,$0 + move $6,$3 + .set noreorder + .set nomacro + beq $8,$0,$L11 + sw $5,-4($10) + .set macro + .set reorder + + lw $4,0($9) + #nop + multu $7,$4 + mfhi $4 + mflo $5 + #nop + move $3,$6 + move $2,$0 + addu $8,$8,-1 + addu $5,$5,$3 + sltu $6,$5,$3 + addu $4,$4,$2 + addu $4,$4,$6 + srl $3,$4,0 + move $2,$0 + move $6,$3 + .set noreorder + .set nomacro + beq $8,$0,$L11 + sw $5,0($10) + .set macro + .set reorder + + addu $9,$9,16 + addu $12,$12,16 + addu $10,$10,16 + .set noreorder + .set nomacro 
+ j $L10 + addu $11,$11,16 + .set macro + .set reorder + +$L11: + .set noreorder + .set nomacro + j $31 + move $2,$6 + .set macro + .set reorder + + .end bn_mul_word + .align 2 + .globl bn_sqr_words + .ent bn_sqr_words +bn_sqr_words: + .frame $sp,0,$31 # vars= 0, regs= 0/0, args= 0, extra= 0 + .mask 0x00000000,0 + .fmask 0x00000000,0 + .set noreorder + .cpload $25 + .set reorder + move $9,$4 + addu $7,$9,28 + addu $8,$5,12 +$L18: + lw $2,0($5) + #nop + multu $2,$2 + mfhi $2 + mflo $3 + #nop + addu $6,$6,-1 + sw $3,0($9) + srl $3,$2,0 + move $2,$0 + .set noreorder + .set nomacro + beq $6,$0,$L19 + sw $3,-24($7) + .set macro + .set reorder + + lw $2,-8($8) + #nop + multu $2,$2 + mfhi $2 + mflo $3 + #nop + addu $6,$6,-1 + sw $3,-20($7) + srl $3,$2,0 + move $2,$0 + .set noreorder + .set nomacro + beq $6,$0,$L19 + sw $3,-16($7) + .set macro + .set reorder + + lw $2,-4($8) + #nop + multu $2,$2 + mfhi $2 + mflo $3 + #nop + addu $6,$6,-1 + sw $3,-12($7) + srl $3,$2,0 + move $2,$0 + .set noreorder + .set nomacro + beq $6,$0,$L19 + sw $3,-8($7) + .set macro + .set reorder + + lw $2,0($8) + #nop + multu $2,$2 + mfhi $2 + mflo $3 + #nop + addu $6,$6,-1 + sw $3,-4($7) + srl $3,$2,0 + move $2,$0 + .set noreorder + .set nomacro + beq $6,$0,$L19 + sw $3,0($7) + .set macro + .set reorder + + addu $8,$8,16 + addu $5,$5,16 + addu $7,$7,32 + .set noreorder + .set nomacro + j $L18 + addu $9,$9,32 + .set macro + .set reorder + +$L19: + j $31 + .end bn_sqr_words + .rdata + .align 2 +$LC0: + + .byte 0x44,0x69,0x76,0x69,0x73,0x69,0x6f,0x6e + .byte 0x20,0x77,0x6f,0x75,0x6c,0x64,0x20,0x6f + .byte 0x76,0x65,0x72,0x66,0x6c,0x6f,0x77,0xa + .byte 0x0 + .text + .align 2 + .globl bn_div64 + .ent bn_div64 +bn_div64: + .frame $sp,56,$31 # vars= 0, regs= 7/0, args= 16, extra= 8 + .mask 0x901f0000,-8 + .fmask 0x00000000,0 + .set noreorder + .cpload $25 + .set reorder + subu $sp,$sp,56 + .cprestore 16 + sw $16,24($sp) + move $16,$4 + sw $17,28($sp) + move $17,$5 + sw $18,32($sp) + move $18,$6 + sw $20,40($sp) + move $20,$0 + sw $19,36($sp) + li $19,0x00000002 # 2 + sw $31,48($sp) + .set noreorder + .set nomacro + bne $18,$0,$L26 + sw $28,44($sp) + .set macro + .set reorder + + .set noreorder + .set nomacro + j $L43 + li $2,-1 # 0xffffffff + .set macro + .set reorder + +$L26: + move $4,$18 + jal BN_num_bits_word + move $4,$2 + li $2,0x00000020 # 32 + .set noreorder + .set nomacro + beq $4,$2,$L27 + li $2,0x00000001 # 1 + .set macro + .set reorder + + sll $2,$2,$4 + sltu $2,$2,$16 + .set noreorder + .set nomacro + beq $2,$0,$L44 + li $5,0x00000020 # 32 + .set macro + .set reorder + + la $4,__iob+32 + la $5,$LC0 + jal fprintf + jal abort +$L27: + li $5,0x00000020 # 32 +$L44: + sltu $2,$16,$18 + .set noreorder + .set nomacro + bne $2,$0,$L28 + subu $4,$5,$4 + .set macro + .set reorder + + subu $16,$16,$18 +$L28: + .set noreorder + .set nomacro + beq $4,$0,$L29 + li $10,-65536 # 0xffff0000 + .set macro + .set reorder + + sll $18,$18,$4 + sll $3,$16,$4 + subu $2,$5,$4 + srl $2,$17,$2 + or $16,$3,$2 + sll $17,$17,$4 +$L29: + srl $7,$18,16 + andi $9,$18,0xffff +$L30: + srl $2,$16,16 + .set noreorder + .set nomacro + beq $2,$7,$L34 + li $6,0x0000ffff # 65535 + .set macro + .set reorder + + divu $6,$16,$7 +$L34: + mult $6,$9 + mflo $5 + #nop + #nop + mult $6,$7 + and $2,$17,$10 + srl $8,$2,16 + mflo $4 +$L35: + subu $3,$16,$4 + and $2,$3,$10 + .set noreorder + .set nomacro + bne $2,$0,$L36 + sll $2,$3,16 + .set macro + .set reorder + + addu $2,$2,$8 + sltu $2,$2,$5 + .set noreorder + .set nomacro + beq $2,$0,$L36 + subu $5,$5,$9 + .set 
macro + .set reorder + + subu $4,$4,$7 + .set noreorder + .set nomacro + j $L35 + addu $6,$6,-1 + .set macro + .set reorder + +$L36: + mult $6,$7 + mflo $5 + #nop + #nop + mult $6,$9 + mflo $4 + #nop + #nop + srl $3,$4,16 + sll $2,$4,16 + and $4,$2,$10 + sltu $2,$17,$4 + .set noreorder + .set nomacro + beq $2,$0,$L40 + addu $5,$5,$3 + .set macro + .set reorder + + addu $5,$5,1 +$L40: + sltu $2,$16,$5 + .set noreorder + .set nomacro + beq $2,$0,$L41 + subu $17,$17,$4 + .set macro + .set reorder + + addu $16,$16,$18 + addu $6,$6,-1 +$L41: + addu $19,$19,-1 + .set noreorder + .set nomacro + beq $19,$0,$L31 + subu $16,$16,$5 + .set macro + .set reorder + + sll $20,$6,16 + sll $3,$16,16 + srl $2,$17,16 + or $16,$3,$2 + .set noreorder + .set nomacro + j $L30 + sll $17,$17,16 + .set macro + .set reorder + +$L31: + or $2,$20,$6 +$L43: + lw $31,48($sp) + lw $20,40($sp) + lw $19,36($sp) + lw $18,32($sp) + lw $17,28($sp) + lw $16,24($sp) + addu $sp,$sp,56 + j $31 + .end bn_div64 + + .globl abort .text + .globl fprintf .text + .globl BN_num_bits_word .text diff --git a/crypto/bn/asm/sparc.s b/crypto/bn/asm/sparc.s new file mode 100644 index 0000000000..37c5fb194e --- /dev/null +++ b/crypto/bn/asm/sparc.s @@ -0,0 +1,359 @@ + .file "bn_mulw.c" +gcc2_compiled.: +.section ".text" + .align 4 + .global bn_mul_add_word + .type bn_mul_add_word,#function + .proc 016 +bn_mul_add_word: + !#PROLOGUE# 0 + save %sp,-112,%sp + !#PROLOGUE# 1 + mov %i0,%o0 + mov %i1,%o2 + mov %i2,%g1 + mov %i3,%o1 + mov 0,%i4 + add %o0,12,%g4 + add %o2,12,%o7 +.LL2: + mov %i4,%i3 + mov 0,%i2 + ld [%o0],%g2 + mov %g2,%i1 + ld [%o2],%g2 + mov 0,%i0 + umul %o1,%g2,%g3 + rd %y,%g2 + addcc %g3,%i1,%g3 + addx %g2,%i0,%g2 + addcc %g3,%i3,%g3 + addx %g2,%i2,%g2 + st %g3,[%o0] + mov %g2,%i5 + mov 0,%i4 + addcc %g1,-1,%g1 + be .LL3 + mov %i5,%i4 + mov %i4,%i3 + mov 0,%i2 + ld [%g4-8],%g2 + mov %g2,%i1 + ld [%o7-8],%g2 + mov 0,%i0 + umul %o1,%g2,%g3 + rd %y,%g2 + addcc %g3,%i1,%g3 + addx %g2,%i0,%g2 + addcc %g3,%i3,%g3 + addx %g2,%i2,%g2 + st %g3,[%g4-8] + mov %g2,%i5 + mov 0,%i4 + addcc %g1,-1,%g1 + be .LL3 + mov %i5,%i4 + mov %i4,%i3 + mov 0,%i2 + ld [%g4-4],%g2 + mov %g2,%i1 + ld [%o7-4],%g2 + mov 0,%i0 + umul %o1,%g2,%g3 + rd %y,%g2 + addcc %g3,%i1,%g3 + addx %g2,%i0,%g2 + addcc %g3,%i3,%g3 + addx %g2,%i2,%g2 + st %g3,[%g4-4] + mov %g2,%i5 + mov 0,%i4 + addcc %g1,-1,%g1 + be .LL3 + mov %i5,%i4 + mov %i4,%i3 + mov 0,%i2 + ld [%g4],%g2 + mov %g2,%i1 + ld [%o7],%g2 + mov 0,%i0 + umul %o1,%g2,%g3 + rd %y,%g2 + addcc %g3,%i1,%g3 + addx %g2,%i0,%g2 + addcc %g3,%i3,%g3 + addx %g2,%i2,%g2 + st %g3,[%g4] + mov %g2,%i5 + mov 0,%i4 + addcc %g1,-1,%g1 + be .LL3 + mov %i5,%i4 + add %o7,16,%o7 + add %o2,16,%o2 + add %g4,16,%g4 + b .LL2 + add %o0,16,%o0 +.LL3: + ret + restore %g0,%i4,%o0 +.LLfe1: + .size bn_mul_add_word,.LLfe1-bn_mul_add_word + .align 4 + .global bn_mul_word + .type bn_mul_word,#function + .proc 016 +bn_mul_word: + !#PROLOGUE# 0 + save %sp,-112,%sp + !#PROLOGUE# 1 + mov %i0,%o7 + mov %i1,%o0 + mov %i2,%i4 + mov %i3,%g4 + mov 0,%i0 + add %o7,12,%g1 + add %o0,12,%i5 +.LL18: + mov %i0,%g3 + mov 0,%g2 + ld [%o0],%i2 + umul %g4,%i2,%i3 + rd %y,%i2 + addcc %i3,%g3,%i3 + addx %i2,%g2,%i2 + st %i3,[%o7] + mov %i2,%i1 + mov 0,%i0 + addcc %i4,-1,%i4 + be .LL19 + mov %i1,%i0 + mov %i0,%g3 + mov 0,%g2 + ld [%i5-8],%i2 + umul %g4,%i2,%i3 + rd %y,%i2 + addcc %i3,%g3,%i3 + addx %i2,%g2,%i2 + st %i3,[%g1-8] + mov %i2,%i1 + mov 0,%i0 + addcc %i4,-1,%i4 + be .LL19 + mov %i1,%i0 + mov %i0,%g3 + mov 0,%g2 + ld [%i5-4],%i2 + umul %g4,%i2,%i3 + rd %y,%i2 + 
addcc %i3,%g3,%i3 + addx %i2,%g2,%i2 + st %i3,[%g1-4] + mov %i2,%i1 + mov 0,%i0 + addcc %i4,-1,%i4 + be .LL19 + mov %i1,%i0 + mov %i0,%g3 + mov 0,%g2 + ld [%i5],%i2 + umul %g4,%i2,%i3 + rd %y,%i2 + addcc %i3,%g3,%i3 + addx %i2,%g2,%i2 + st %i3,[%g1] + mov %i2,%i1 + mov 0,%i0 + addcc %i4,-1,%i4 + be .LL19 + mov %i1,%i0 + add %i5,16,%i5 + add %o0,16,%o0 + add %g1,16,%g1 + b .LL18 + add %o7,16,%o7 +.LL19: + ret + restore +.LLfe2: + .size bn_mul_word,.LLfe2-bn_mul_word + .align 4 + .global bn_sqr_words + .type bn_sqr_words,#function + .proc 020 +bn_sqr_words: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + mov %o0,%g4 + add %g4,28,%o3 + add %o1,12,%g1 +.LL34: + ld [%o1],%o0 + addcc %o2,-1,%o2 + umul %o0,%o0,%o5 + rd %y,%o4 + st %o5,[%g4] + mov %o4,%g3 + mov 0,%g2 + be .LL35 + st %g3,[%o3-24] + ld [%g1-8],%o0 + addcc %o2,-1,%o2 + umul %o0,%o0,%o5 + rd %y,%o4 + st %o5,[%o3-20] + mov %o4,%g3 + mov 0,%g2 + be .LL35 + st %g3,[%o3-16] + ld [%g1-4],%o0 + addcc %o2,-1,%o2 + umul %o0,%o0,%o5 + rd %y,%o4 + st %o5,[%o3-12] + mov %o4,%g3 + mov 0,%g2 + be .LL35 + st %g3,[%o3-8] + ld [%g1],%o0 + addcc %o2,-1,%o2 + umul %o0,%o0,%o5 + rd %y,%o4 + st %o5,[%o3-4] + mov %o4,%g3 + mov 0,%g2 + be .LL35 + st %g3,[%o3] + add %g1,16,%g1 + add %o1,16,%o1 + add %o3,32,%o3 + b .LL34 + add %g4,32,%g4 +.LL35: + retl + nop +.LLfe3: + .size bn_sqr_words,.LLfe3-bn_sqr_words +.section ".rodata" + .align 8 +.LLC0: + .asciz "Division would overflow\n" +.section ".text" + .align 4 + .global bn_div64 + .type bn_div64,#function + .proc 016 +bn_div64: + !#PROLOGUE# 0 + save %sp,-112,%sp + !#PROLOGUE# 1 + mov 0,%l1 + cmp %i2,0 + bne .LL42 + mov 2,%l0 + b .LL59 + mov -1,%i0 +.LL42: + call BN_num_bits_word,0 + mov %i2,%o0 + mov %o0,%o2 + cmp %o2,32 + be .LL43 + mov 1,%o0 + sll %o0,%o2,%o0 + cmp %i0,%o0 + bleu .LL60 + mov 32,%o0 + sethi %hi(__iob+32),%o0 + or %o0,%lo(__iob+32),%o0 + sethi %hi(.LLC0),%o1 + call fprintf,0 + or %o1,%lo(.LLC0),%o1 + call abort,0 + nop +.LL43: + mov 32,%o0 +.LL60: + cmp %i0,%i2 + blu .LL44 + sub %o0,%o2,%o2 + sub %i0,%i2,%i0 +.LL44: + cmp %o2,0 + be .LL45 + sethi %hi(-65536),%o7 + sll %i2,%o2,%i2 + sll %i0,%o2,%o1 + sub %o0,%o2,%o0 + srl %i1,%o0,%o0 + or %o1,%o0,%i0 + sll %i1,%o2,%i1 +.LL45: + srl %i2,16,%g2 + sethi %hi(65535),%o0 + or %o0,%lo(65535),%o1 + and %i2,%o1,%g3 + mov %o0,%g4 + mov %o1,%g1 +.LL46: + srl %i0,16,%o0 + cmp %o0,%g2 + be .LL50 + or %g4,%lo(65535),%o3 + wr %g0,%g0,%y + nop + nop + nop + udiv %i0,%g2,%o3 +.LL50: + and %i1,%o7,%o0 + srl %o0,16,%o5 + smul %o3,%g3,%o4 + smul %o3,%g2,%o2 +.LL51: + sub %i0,%o2,%o1 + andcc %o1,%o7,%g0 + bne .LL52 + sll %o1,16,%o0 + add %o0,%o5,%o0 + cmp %o4,%o0 + bleu .LL52 + sub %o4,%g3,%o4 + sub %o2,%g2,%o2 + b .LL51 + add %o3,-1,%o3 +.LL52: + smul %o3,%g2,%o2 + smul %o3,%g3,%o0 + srl %o0,16,%o1 + sll %o0,16,%o0 + and %o0,%o7,%o0 + cmp %i1,%o0 + bgeu .LL56 + add %o2,%o1,%o2 + add %o2,1,%o2 +.LL56: + cmp %i0,%o2 + bgeu .LL57 + sub %i1,%o0,%i1 + add %i0,%i2,%i0 + add %o3,-1,%o3 +.LL57: + addcc %l0,-1,%l0 + be .LL47 + sub %i0,%o2,%i0 + sll %o3,16,%l1 + sll %i0,16,%o0 + srl %i1,16,%o1 + or %o0,%o1,%i0 + and %i1,%g1,%o0 + b .LL46 + sll %o0,16,%i1 +.LL47: + or %l1,%o3,%i0 +.LL59: + ret + restore +.LLfe4: + .size bn_div64,.LLfe4-bn_div64 + .ident "GCC: (GNU) 2.7.0" diff --git a/crypto/bn/asm/x86-bsdi.s b/crypto/bn/asm/x86-bsdi.s new file mode 100644 index 0000000000..ca6687648e --- /dev/null +++ b/crypto/bn/asm/x86-bsdi.s @@ -0,0 +1,272 @@ + .file "bn_mulw.c" + .version "01.01" +gcc2_compiled.: +.text + .align 4 +.globl _bn_mul_add_word +_bn_mul_add_word: + pushl %ebp + pushl 
%edi + pushl %esi + pushl %ebx + + # ax L(t) + # dx H(t) + # bx a + # cx w + # di r + # si c + # bp num + xorl %esi,%esi # c=0 + movl 20(%esp),%edi # r => edi + movl 24(%esp),%ebx # a => exb + movl 32(%esp),%ecx # w => ecx + movl 28(%esp),%ebp # num => ebp + + shrl $2,%ebp # num/4 + je .L910 + +# .align 4 +.L110: + # Round 1 + movl %ecx,%eax # w => eax + mull (%ebx) # w * *a + addl (%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+= carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + # Round 2 + movl %ecx,%eax # w => eax + mull 4(%ebx) # w * *a + addl 4(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+= carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,4(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + # Round 3 + movl %ecx,%eax # w => eax + mull 8(%ebx) # w * *a + addl 8(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,8(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + # Round 4 + movl %ecx,%eax # w => eax + mull 12(%ebx) # w * *a + addl 12(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,12(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + addl $16,%ebx # a+=4 (4 words) + addl $16,%edi # r+=4 (4 words) + + decl %ebp # --num + je .L910 + jmp .L110 +# .align 4 +.L910: + movl 28(%esp),%ebp # num => ebp + andl $3,%ebp + je .L111 + + # Round 1 + movl %ecx,%eax # w => eax + mull (%ebx) # w * *a + addl (%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L111 + + # Round 2 + movl %ecx,%eax # w => eax + mull 4(%ebx) # w * *a + addl 4(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,4(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L111 + + # Round 3 + movl %ecx,%eax # w => eax + mull 8(%ebx) # w * *a + addl 8(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,8(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + +# .align 4 +.L111: + movl %esi,%eax # return(c) + popl %ebx + popl %esi + popl %edi + popl %ebp + ret +.Lfe1: + .align 4 +.globl _bn_mul_word +_bn_mul_word: + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + # ax L(t) + # dx H(t) + # bx a + # cx w + # di r + # num bp + # si c + xorl %esi,%esi # c=0 + movl 20(%esp),%edi # r => edi + movl 24(%esp),%ebx # a => exb + movl 28(%esp),%ebp # num => bp + movl 32(%esp),%ecx # w => ecx + +# .align 4 +.L210: + movl %ecx,%eax # w => eax + mull (%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + movl %ecx,%eax # w => eax + mull 4(%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,4(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + movl %ecx,%eax # w => eax + mull 8(%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,8(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + movl %ecx,%eax # w => eax + mull 12(%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,12(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + addl $16,%ebx # a+=4 (4 words) + addl $16,%edi # r+=4 (4 words) + 
+ jmp .L210 +# .align 4 +.L211: + movl %esi,%eax # return(c) + popl %ebx + popl %esi + popl %edi + popl %ebp + ret +.Lfe2: + .align 4 +.globl _bn_sqr_words +_bn_sqr_words: + pushl %edi + pushl %esi + pushl %ebx + movl 16(%esp),%esi # r + movl 20(%esp),%edi # a + movl 24(%esp),%ebx # n +# .align 4 + shrl $2,%ebx + jz .L99 +.L28: + movl (%edi),%eax # get a + mull %eax # a*a + movl %eax,(%esi) # put low into return addr + movl %edx,4(%esi) # put high into return addr + + movl 4(%edi),%eax # get a + mull %eax # a*a + movl %eax,8(%esi) # put low into return addr + movl %edx,12(%esi) # put high into return addr + + movl 8(%edi),%eax # get a + mull %eax # a*a + movl %eax,16(%esi) # put low into return addr + movl %edx,20(%esi) # put high into return addr + + movl 12(%edi),%eax # get a + mull %eax # a*a + movl %eax,24(%esi) # put low into return addr + movl %edx,28(%esi) # put high into return addr + + addl $16,%edi + addl $32,%esi + decl %ebx # n-=4; + jz .L99 + jmp .L28 +# .align 4 +.L99: + movl 24(%esp),%ebx # n + andl $3,%ebx + jz .L29 + movl (%edi),%eax # get a + mull %eax # a*a + movl %eax,(%esi) # put low into return addr + movl %edx,4(%esi) # put high into return addr + decl %ebx # n--; + jz .L29 + movl 4(%edi),%eax # get a + mull %eax # a*a + movl %eax,8(%esi) # put low into return addr + movl %edx,12(%esi) # put high into return addr + decl %ebx # n--; + jz .L29 + movl 8(%edi),%eax # get a + mull %eax # a*a + movl %eax,16(%esi) # put low into return addr + movl %edx,20(%esi) # put high into return addr + +.L29: + popl %ebx + popl %esi + popl %edi + ret +.Lfe3: + .align 4 +.globl _bn_div64 +_bn_div64: + movl 4(%esp),%edx # a + movl 8(%esp),%eax # b + divl 12(%esp) # ab/c + ret +.Lfe4: + .ident "GCC: (GNU) 2.6.3" diff --git a/crypto/bn/asm/x86-lnx.s b/crypto/bn/asm/x86-lnx.s new file mode 100644 index 0000000000..5123867440 --- /dev/null +++ b/crypto/bn/asm/x86-lnx.s @@ -0,0 +1,282 @@ + .file "bn_mulw.c" + .version "01.01" +gcc2_compiled.: +.text + .align 16 +.globl bn_mul_add_word + .type bn_mul_add_word,@function +bn_mul_add_word: + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + # ax L(t) + # dx H(t) + # bx a + # cx w + # di r + # si c + # bp num + xorl %esi,%esi # c=0 + movl 20(%esp),%edi # r => edi + movl 24(%esp),%ebx # a => exb + movl 32(%esp),%ecx # w => ecx + movl 28(%esp),%ebp # num => ebp + + shrl $2,%ebp # num/4 + je .L910 + + .align 4 +.L110: + # Round 1 + movl %ecx,%eax # w => eax + mull (%ebx) # w * *a + addl (%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+= carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + # Round 2 + movl %ecx,%eax # w => eax + mull 4(%ebx) # w * *a + addl 4(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+= carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,4(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + # Round 3 + movl %ecx,%eax # w => eax + mull 8(%ebx) # w * *a + addl 8(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,8(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + # Round 4 + movl %ecx,%eax # w => eax + mull 12(%ebx) # w * *a + addl 12(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,12(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + addl $16,%ebx # a+=4 (4 words) + addl $16,%edi # r+=4 (4 words) + + decl %ebp # --num + je .L910 + jmp .L110 + .align 4 +.L910: + movl 28(%esp),%ebp # num => ebp + andl $3,%ebp + je .L111 + 
+ # Round 1 + movl %ecx,%eax # w => eax + mull (%ebx) # w * *a + addl (%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L111 + + # Round 2 + movl %ecx,%eax # w => eax + mull 4(%ebx) # w * *a + addl 4(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,4(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L111 + + # Round 3 + movl %ecx,%eax # w => eax + mull 8(%ebx) # w * *a + addl 8(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,8(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + .align 4 +.L111: + movl %esi,%eax # return(c) + popl %ebx + popl %esi + popl %edi + popl %ebp + ret +.Lfe1: + .size bn_mul_add_word,.Lfe1-bn_mul_add_word + .align 16 +.globl bn_mul_word + .type bn_mul_word,@function +bn_mul_word: + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + # ax L(t) + # dx H(t) + # bx a + # cx w + # di r + # num bp + # si c + xorl %esi,%esi # c=0 + movl 20(%esp),%edi # r => edi + movl 24(%esp),%ebx # a => exb + movl 28(%esp),%ebp # num => bp + movl 32(%esp),%ecx # w => ecx + + .align 4 +.L210: + movl %ecx,%eax # w => eax + mull (%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + movl %ecx,%eax # w => eax + mull 4(%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,4(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + movl %ecx,%eax # w => eax + mull 8(%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,8(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + movl %ecx,%eax # w => eax + mull 12(%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,12(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + addl $16,%ebx # a+=4 (4 words) + addl $16,%edi # r+=4 (4 words) + + jmp .L210 + .align 16 +.L211: + movl %esi,%eax # return(c) + popl %ebx + popl %esi + popl %edi + popl %ebp + ret +.Lfe2: + .size bn_mul_word,.Lfe2-bn_mul_word + + .align 16 +.globl bn_sqr_words + .type bn_sqr_words,@function +bn_sqr_words: + pushl %edi + pushl %esi + pushl %ebx + movl 16(%esp),%esi # r + movl 20(%esp),%edi # a + movl 24(%esp),%ebx # n + .align 4 + shrl $2,%ebx + jz .L99 +.L28: + movl (%edi),%eax # get a + mull %eax # a*a + movl %eax,(%esi) # put low into return addr + movl %edx,4(%esi) # put high into return addr + + movl 4(%edi),%eax # get a + mull %eax # a*a + movl %eax,8(%esi) # put low into return addr + movl %edx,12(%esi) # put high into return addr + + movl 8(%edi),%eax # get a + mull %eax # a*a + movl %eax,16(%esi) # put low into return addr + movl %edx,20(%esi) # put high into return addr + + movl 12(%edi),%eax # get a + mull %eax # a*a + movl %eax,24(%esi) # put low into return addr + movl %edx,28(%esi) # put high into return addr + + addl $16,%edi + addl $32,%esi + decl %ebx # n-=4; + jz .L99 + jmp .L28 + .align 16 +.L99: + movl 24(%esp),%ebx # n + andl $3,%ebx + jz .L29 + movl (%edi),%eax # get a + mull %eax # a*a + movl %eax,(%esi) # put low into return addr + movl %edx,4(%esi) # put high into return addr + decl %ebx # n--; + jz .L29 + movl 4(%edi),%eax # get a + mull %eax # a*a + movl %eax,8(%esi) # put low into return addr + movl 
%edx,12(%esi) # put high into return addr + decl %ebx # n--; + jz .L29 + movl 8(%edi),%eax # get a + mull %eax # a*a + movl %eax,16(%esi) # put low into return addr + movl %edx,20(%esi) # put high into return addr + +.L29: + popl %ebx + popl %esi + popl %edi + ret +.Lfe3: + .size bn_sqr_words,.Lfe3-bn_sqr_words + + .align 16 +.globl bn_div64 + .type bn_div64,@function +bn_div64: + movl 4(%esp),%edx # a + movl 8(%esp),%eax # b + divl 12(%esp) # ab/c + ret +.Lfe4: + .size bn_div64,.Lfe4-bn_div64 + .ident "GCC: (GNU) 2.6.3" diff --git a/crypto/bn/asm/x86-lnxa.s b/crypto/bn/asm/x86-lnxa.s new file mode 100644 index 0000000000..74855dc74d --- /dev/null +++ b/crypto/bn/asm/x86-lnxa.s @@ -0,0 +1,282 @@ + .file "bn_mulw.c" + .version "01.01" +gcc2_compiled.: +.text + .align 4 +.globl _bn_mul_add_word + .type _bn_mul_add_word,@function +_bn_mul_add_word: + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + # ax L(t) + # dx H(t) + # bx a + # cx w + # di r + # si c + # bp num + xorl %esi,%esi # c=0 + movl 20(%esp),%edi # r => edi + movl 24(%esp),%ebx # a => exb + movl 32(%esp),%ecx # w => ecx + movl 28(%esp),%ebp # num => ebp + + shrl $2,%ebp # num/4 + je .L910 + +# .align 4 +.L110: + # Round 1 + movl %ecx,%eax # w => eax + mull (%ebx) # w * *a + addl (%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+= carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + # Round 2 + movl %ecx,%eax # w => eax + mull 4(%ebx) # w * *a + addl 4(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+= carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,4(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + # Round 3 + movl %ecx,%eax # w => eax + mull 8(%ebx) # w * *a + addl 8(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,8(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + # Round 4 + movl %ecx,%eax # w => eax + mull 12(%ebx) # w * *a + addl 12(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,12(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + + addl $16,%ebx # a+=4 (4 words) + addl $16,%edi # r+=4 (4 words) + + decl %ebp # --num + je .L910 + jmp .L110 +# .align 4 +.L910: + movl 28(%esp),%ebp # num => ebp + andl $3,%ebp + je .L111 + + # Round 1 + movl %ecx,%eax # w => eax + mull (%ebx) # w * *a + addl (%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L111 + + # Round 2 + movl %ecx,%eax # w => eax + mull 4(%ebx) # w * *a + addl 4(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,4(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L111 + + # Round 3 + movl %ecx,%eax # w => eax + mull 8(%ebx) # w * *a + addl 8(%edi),%eax # *r+=L(t) + adcl $0,%edx # H(t)+=carry + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,8(%edi) # *r+=L(t) + movl %edx,%esi # c=H(t) + +# .align 4 +.L111: + movl %esi,%eax # return(c) + popl %ebx + popl %esi + popl %edi + popl %ebp + ret +.Lfe1: + .size _bn_mul_add_word,.Lfe1-_bn_mul_add_word + .align 4 +.globl _bn_mul_word + .type _bn_mul_word,@function +_bn_mul_word: + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + # ax L(t) + # dx H(t) + # bx a + # cx w + # di r + # num bp + # si c + xorl %esi,%esi # c=0 + movl 20(%esp),%edi # r => edi + movl 
24(%esp),%ebx # a => exb + movl 28(%esp),%ebp # num => bp + movl 32(%esp),%ecx # w => ecx + +# .align 4 +.L210: + movl %ecx,%eax # w => eax + mull (%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + movl %ecx,%eax # w => eax + mull 4(%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,4(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + movl %ecx,%eax # w => eax + mull 8(%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,8(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + movl %ecx,%eax # w => eax + mull 12(%ebx) # w * *a + addl %esi,%eax # L(t)+=c + adcl $0,%edx # H(t)+=carry + movl %eax,12(%edi) # *r=L(t) + movl %edx,%esi # c=H(t) + decl %ebp # --num + je .L211 + + addl $16,%ebx # a+=4 (4 words) + addl $16,%edi # r+=4 (4 words) + + jmp .L210 +# .align 4 +.L211: + movl %esi,%eax # return(c) + popl %ebx + popl %esi + popl %edi + popl %ebp + ret +.Lfe2: + .size _bn_mul_word,.Lfe2-_bn_mul_word + + .align 4 +.globl _bn_sqr_words + .type _bn_sqr_words,@function +_bn_sqr_words: + pushl %edi + pushl %esi + pushl %ebx + movl 16(%esp),%esi # r + movl 20(%esp),%edi # a + movl 24(%esp),%ebx # n +# .align 4 + shrl $2,%ebx + jz .L99 +.L28: + movl (%edi),%eax # get a + mull %eax # a*a + movl %eax,(%esi) # put low into return addr + movl %edx,4(%esi) # put high into return addr + + movl 4(%edi),%eax # get a + mull %eax # a*a + movl %eax,8(%esi) # put low into return addr + movl %edx,12(%esi) # put high into return addr + + movl 8(%edi),%eax # get a + mull %eax # a*a + movl %eax,16(%esi) # put low into return addr + movl %edx,20(%esi) # put high into return addr + + movl 12(%edi),%eax # get a + mull %eax # a*a + movl %eax,24(%esi) # put low into return addr + movl %edx,28(%esi) # put high into return addr + + addl $16,%edi + addl $32,%esi + decl %ebx # n-=4; + jz .L99 + jmp .L28 +# .align 4 +.L99: + movl 24(%esp),%ebx # n + andl $3,%ebx + jz .L29 + movl (%edi),%eax # get a + mull %eax # a*a + movl %eax,(%esi) # put low into return addr + movl %edx,4(%esi) # put high into return addr + decl %ebx # n--; + jz .L29 + movl 4(%edi),%eax # get a + mull %eax # a*a + movl %eax,8(%esi) # put low into return addr + movl %edx,12(%esi) # put high into return addr + decl %ebx # n--; + jz .L29 + movl 8(%edi),%eax # get a + mull %eax # a*a + movl %eax,16(%esi) # put low into return addr + movl %edx,20(%esi) # put high into return addr + +.L29: + popl %ebx + popl %esi + popl %edi + ret +.Lfe3: + .size _bn_sqr_words,.Lfe3-_bn_sqr_words + + .align 4 +.globl _bn_div64 + .type _bn_div64,@function +_bn_div64: + movl 4(%esp),%edx # a + movl 8(%esp),%eax # b + divl 12(%esp) # ab/c + ret +.Lfe4: + .size _bn_div64,.Lfe4-_bn_div64 + .ident "GCC: (GNU) 2.6.3" diff --git a/crypto/bn/asm/x86-sol.s b/crypto/bn/asm/x86-sol.s new file mode 100644 index 0000000000..c961e64fa0 --- /dev/null +++ b/crypto/bn/asm/x86-sol.s @@ -0,0 +1,224 @@ + .file "bn_mulw.c" + .version "01.01" +gcc2_compiled.: +.text + .align 16 +.globl bn_mul_add_word + .type bn_mul_add_word,@function +bn_mul_add_word: + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + / ax L(t) + / dx H(t) + / bx a + / cx w + / di r + / si c + / bp num + xorl %esi,%esi / c=0 + movl 20(%esp),%edi / r => edi + movl 24(%esp),%ebx / a => exb + movl 28(%esp),%ebp / num => ebp + movl 32(%esp),%ecx / w => ecx + + .align 4 +.L110: + movl %ecx,%eax / w => eax + mull 
(%ebx) / w * *a + addl (%edi),%eax / L(t)+= *r + adcl $0,%edx / H(t)+= carry + addl %esi,%eax / L(t)+=c + adcl $0,%edx / H(t)+=carry + movl %eax,(%edi) / *r=L(t) + movl %edx,%esi / c=H(t) + decl %ebp / --num + je .L111 + + movl %ecx,%eax / w => eax + mull 4(%ebx) / w * *a + addl 4(%edi),%eax / L(t)+= *r + adcl $0,%edx / H(t)+= carry + addl %esi,%eax / L(t)+=c + adcl $0,%edx / H(t)+=carry + movl %eax,4(%edi) / *r=L(t) + movl %edx,%esi / c=H(t) + decl %ebp / --num + je .L111 + + movl %ecx,%eax / w => eax + mull 8(%ebx) / w * *a + addl 8(%edi),%eax / L(t)+= *r + adcl $0,%edx / H(t)+= carry + addl %esi,%eax / L(t)+=c + adcl $0,%edx / H(t)+=carry + movl %eax,8(%edi) / *r=L(t) + movl %edx,%esi / c=H(t) + decl %ebp / --num + je .L111 + + movl %ecx,%eax / w => eax + mull 12(%ebx) / w * *a + addl 12(%edi),%eax / L(t)+= *r + adcl $0,%edx / H(t)+= carry + addl %esi,%eax / L(t)+=c + adcl $0,%edx / H(t)+=carry + movl %eax,12(%edi) / *r=L(t) + movl %edx,%esi / c=H(t) + decl %ebp / --num + je .L111 + + addl $16,%ebx / a+=4 (4 words) + addl $16,%edi / r+=4 (4 words) + + jmp .L110 + .align 16 +.L111: + movl %esi,%eax / return(c) + popl %ebx + popl %esi + popl %edi + popl %ebp + ret +.Lfe1: + .size bn_mul_add_word,.Lfe1-bn_mul_add_word + .align 16 +.globl bn_mul_word + .type bn_mul_word,@function +bn_mul_word: + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + / ax L(t) + / dx H(t) + / bx a + / cx w + / di r + / num bp + / si c + xorl %esi,%esi / c=0 + movl 20(%esp),%edi / r => edi + movl 24(%esp),%ebx / a => exb + movl 28(%esp),%ebp / num => ebp + movl 32(%esp),%ecx / w => ecx + + .align 4 +.L210: + movl %ecx,%eax / w => eax + mull (%ebx) / w * *a + addl %esi,%eax / L(t)+=c + adcl $0,%edx / H(t)+=carry + movl %eax,(%edi) / *r=L(t) + movl %edx,%esi / c=H(t) + decl %ebp / --num + je .L211 + + movl %ecx,%eax / w => eax + mull 4(%ebx) / w * *a + addl %esi,%eax / L(t)+=c + adcl $0,%edx / H(t)+=carry + movl %eax,4(%edi) / *r=L(t) + movl %edx,%esi / c=H(t) + decl %ebp / --num + je .L211 + + movl %ecx,%eax / w => eax + mull 8(%ebx) / w * *a + addl %esi,%eax / L(t)+=c + adcl $0,%edx / H(t)+=carry + movl %eax,8(%edi) / *r=L(t) + movl %edx,%esi / c=H(t) + decl %ebp / --num + je .L211 + + movl %ecx,%eax / w => eax + mull 12(%ebx) / w * *a + addl %esi,%eax / L(t)+=c + adcl $0,%edx / H(t)+=carry + movl %eax,12(%edi) / *r=L(t) + movl %edx,%esi / c=H(t) + decl %ebp / --num + je .L211 + + addl $16,%ebx / a+=4 (4 words) + addl $16,%edi / r+=4 (4 words) + + jmp .L210 + .align 16 +.L211: + movl %esi,%eax / return(c) + popl %ebx + popl %esi + popl %edi + popl %ebp + ret +.Lfe2: + .size bn_mul_word,.Lfe2-bn_mul_word + + .align 16 +.globl bn_sqr_words + .type bn_sqr_words,@function +bn_sqr_words: + pushl %edi + pushl %esi + pushl %ebx + movl 16(%esp),%esi / r + movl 20(%esp),%edi / a + movl 24(%esp),%ebx / n + .align 4 +.L28: + movl (%edi),%eax / get a + mull %eax / a*a + movl %eax,(%esi) / put low into return addr + movl %edx,4(%esi) / put high into return addr + decl %ebx / n--; + je .L29 + + movl 4(%edi),%eax / get a + mull %eax / a*a + movl %eax,8(%esi) / put low into return addr + movl %edx,12(%esi) / put high into return addr + decl %ebx / n--; + je .L29 + + movl 8(%edi),%eax / get a + mull %eax / a*a + movl %eax,16(%esi) / put low into return addr + movl %edx,20(%esi) / put high into return addr + decl %ebx / n--; + je .L29 + + movl 12(%edi),%eax / get a + mull %eax / a*a + movl %eax,24(%esi) / put low into return addr + movl %edx,28(%esi) / put high into return addr + decl %ebx / n--; + je .L29 + + addl $16,%edi + 
addl $32,%esi + jmp .L28 + .align 16 +.L29: + popl %ebx + popl %esi + popl %edi + ret +.Lfe3: + .size bn_sqr_words,.Lfe3-bn_sqr_words + + .align 16 +.globl bn_div64 + .type bn_div64,@function +bn_div64: + movl 4(%esp),%edx / a + movl 8(%esp),%eax / b + divl 12(%esp) / ab/c + ret +.Lfe4: + .size bn_div64,.Lfe4-bn_div64 + .ident "GCC: (GNU) 2.6.3" diff --git a/crypto/bn/asm/x86nt32.asm b/crypto/bn/asm/x86nt32.asm new file mode 100644 index 0000000000..0198c2c583 --- /dev/null +++ b/crypto/bn/asm/x86nt32.asm @@ -0,0 +1,288 @@ + TITLE bn_mulw.c + .386P +.model FLAT +PUBLIC _bn_mul_add_word +_TEXT SEGMENT +; File bn_mulw.c +_bn_mul_add_word PROC NEAR + push ebp + push ebx + push esi + push edi + mov edi,DWORD PTR 20[esp] ; r + mov ebx,DWORD PTR 24[esp] ; a + mov ecx,DWORD PTR 32[esp] ; w + xor esi,esi ; c=0 + + mov ebp,DWORD PTR 28[esp] ; num + shr ebp,2 ; num/4 + jz $L666 + +$L546: + ; Round one + mov eax,DWORD PTR [ebx] ; edx:eax = *a * w + mul ecx + add eax,DWORD PTR [edi] ; *r+=ax + adc edx,0 + add eax,esi ; edx:eax += c + adc edx,0 + mov DWORD PTR [edi],eax ; *r+=ax + mov esi,edx ; c = overflow + + ; Round two + mov eax,DWORD PTR 4[ebx] ; edx:eax = *a * w + mul ecx + add eax,DWORD PTR 4[edi] ; *r+=ax + adc edx,0 + add eax,esi ; edx:eax += c + adc edx,0 + mov DWORD PTR 4[edi],eax ; *r+=ax + mov esi,edx ; c = overflow + + ; Round three + mov eax,DWORD PTR 8[ebx] ; edx:eax = *a * w + mul ecx + add eax,DWORD PTR 8[edi] ; *r+=ax + adc edx,0 + add eax,esi ; edx:eax += c + adc edx,0 + mov DWORD PTR 8[edi],eax ; *r+=ax + mov esi,edx ; c = overflow + + ; Round four + mov eax,DWORD PTR 12[ebx] ; edx:eax = *a * w + mul ecx + add eax,DWORD PTR 12[edi] ; *r+=ax + adc edx,0 + add eax,esi ; edx:eax += c + adc edx,0 + mov DWORD PTR 12[edi],eax ; *r+=ax + mov esi,edx ; c = overflow + + add ebx,16 + add edi,16 + + dec ebp + jz $L666 + jmp $L546 +$L666: + mov ebp,DWORD PTR 28[esp] ; num + and ebp,3 ; num%4 + jz $L547 + + ; Round one + mov eax,DWORD PTR [ebx] ; edx:eax = *a * w + mul ecx + add eax,DWORD PTR [edi] ; *r+=ax + adc edx,0 + add eax,esi ; edx:eax += c + adc edx,0 + mov DWORD PTR [edi],eax ; *r+=ax + mov esi,edx ; c = overflow + dec ebp + jz $L547 + ; Round two + mov eax,DWORD PTR 4[ebx] ; edx:eax = *a * w + mul ecx + add eax,DWORD PTR 4[edi] ; *r+=ax + adc edx,0 + add eax,esi ; edx:eax += c + adc edx,0 + mov DWORD PTR 4[edi],eax ; *r+=ax + mov esi,edx ; c = overflow + dec ebp + jz $L547 + ; Round three + mov eax,DWORD PTR 8[ebx] ; edx:eax = *a * w + mul ecx + add eax,DWORD PTR 8[edi] ; *r+=ax + adc edx,0 + add eax,esi ; edx:eax += c + adc edx,0 + mov DWORD PTR 8[edi],eax ; *r+=ax + mov esi,edx ; c = overflow + +$L547: + mov eax,esi + pop edi + pop esi + pop ebx + pop ebp + ret +_bn_mul_add_word ENDP +_TEXT ENDS +PUBLIC _bn_mul_word +_TEXT SEGMENT +_bn_mul_word PROC NEAR + push ebp + push ebx + push esi + push edi + + mov edi,DWORD PTR 20[esp] ; r + mov ebx,DWORD PTR 24[esp] ; a + mov ebp,DWORD PTR 28[esp] ; num + mov ecx,DWORD PTR 32[esp] ; w + xor esi,esi ; c=0 + + shr ebp,2 ; num/4 + jz $L266 + +$L593: + ; Round one + mov eax,DWORD PTR [ebx] ; edx:eax= w * *a + mul ecx + add eax,esi ; edx:eax+=c + adc edx,0 + mov DWORD PTR [edi],eax ; *r=eax + mov esi,edx ; c=edx + ; Round two + mov eax,DWORD PTR 4[ebx] ; edx:eax= w * *a + mul ecx + add eax,esi ; edx:eax+=c + adc edx,0 + mov DWORD PTR 4[edi],eax ; *r=eax + mov esi,edx ; c=edx + ; Round three + mov eax,DWORD PTR 8[ebx] ; edx:eax= w * *a + mul ecx + add eax,esi ; edx:eax+=c + adc edx,0 + mov DWORD PTR 8[edi],eax ; *r=eax + mov esi,edx ; c=edx + ; 
Round four + mov eax,DWORD PTR 12[ebx] ; edx:eax= w * *a + mul ecx + add eax,esi ; edx:eax+=c + adc edx,0 + mov DWORD PTR 12[edi],eax ; *r=eax + mov esi,edx ; c=edx + + add ebx,16 + add edi,16 + + dec ebp + jz $L266 + jmp $L593 +$L266: + mov ebp,DWORD PTR 28[esp] ; num + and ebp,3 + jz $L601 + + ; Round one + mov eax,DWORD PTR [ebx] ; edx:eax= w * *a + mul ecx + add eax,esi ; edx:eax+=c + adc edx,0 + mov DWORD PTR [edi],eax ; *r=eax + mov esi,edx ; c=edx + dec ebp + jz $L601 + ; Round two + mov eax,DWORD PTR 4[ebx] ; edx:eax= w * *a + mul ecx + add eax,esi ; edx:eax+=c + adc edx,0 + mov DWORD PTR 4[edi],eax ; *r=eax + mov esi,edx ; c=edx + dec ebp + jz $L601 + ; Round three + mov eax,DWORD PTR 8[ebx] ; edx:eax= w * *a + mul ecx + add eax,esi ; edx:eax+=c + adc edx,0 + mov DWORD PTR 8[edi],eax ; *r=eax + mov esi,edx ; c=edx + +$L601: + mov eax,esi + pop edi + pop esi + pop ebx + pop ebp + ret +_bn_mul_word ENDP +_TEXT ENDS +PUBLIC _bn_sqr_words +_TEXT SEGMENT +_bn_sqr_words PROC NEAR + push ebx + push esi + push edi + mov esi,DWORD PTR 16[esp] ; r + mov edi,DWORD PTR 20[esp] ; a + mov ebx,DWORD PTR 24[esp] ; num + + shr ebx,2 ; num/4 + jz $L111 +$L640: + ; Round 1 + mov eax, DWORD PTR [edi] + mul eax ; *a * *a + mov DWORD PTR [esi],eax + mov DWORD PTR 4[esi],edx + ; Round 2 + mov eax, DWORD PTR 4[edi] + mul eax ; *a * *a + mov DWORD PTR 8[esi],eax + mov DWORD PTR 12[esi],edx + ; Round 3 + mov eax, DWORD PTR 8[edi] + mul eax ; *a * *a + mov DWORD PTR 16[esi],eax + mov DWORD PTR 20[esi],edx + ; Round 4 + mov eax, DWORD PTR 12[edi] + mul eax ; *a * *a + mov DWORD PTR 24[esi],eax + mov DWORD PTR 28[esi],edx + + add edi,16 + add esi,32 + + dec ebx + jz $L111 + jmp $L640 +$L111: + mov ebx,DWORD PTR 24[esp] ; num + and ebx,3 ; num%3 + jz $L645 + + ; Round 1 + mov eax, DWORD PTR [edi] + mul eax ; *a * *a + mov DWORD PTR [esi],eax + mov DWORD PTR 4[esi],edx + dec ebx + jz $L645 + ; Round 2 + mov eax, DWORD PTR 4[edi] + mul eax ; *a * *a + mov DWORD PTR 8[esi],eax + mov DWORD PTR 12[esi],edx + dec ebx + jz $L645 + ; Round 3 + mov eax, DWORD PTR 8[edi] + mul eax ; *a * *a + mov DWORD PTR 16[esi],eax + mov DWORD PTR 20[esi],edx + +$L645: + pop edi + pop esi + pop ebx + ret +_bn_sqr_words ENDP +_TEXT ENDS +PUBLIC _bn_div64 +_TEXT SEGMENT +_bn_div64 PROC NEAR + mov edx, DWORD PTR 4[esp] + mov eax, DWORD PTR 8[esp] + div DWORD PTR 12[esp] + ret +_bn_div64 ENDP +_TEXT ENDS +END diff --git a/crypto/bn/asm/x86nt32.uu b/crypto/bn/asm/x86nt32.uu new file mode 100644 index 0000000000..99207987c1 --- /dev/null +++ b/crypto/bn/asm/x86nt32.uu @@ -0,0 +1,22 @@ +begin 640 x86nt32.obj +M3`$"`/H&DC-6`@``"P`````````N=&5X=```````````````\@$``&0````` +M```````````````@`#!@+F1A=&$```#R`0````````````!6`@`````````` +M````````0``PP%535E>+?"04BUPD&(M,)"`S]HML)!S![0)T7(L#]^$#!X/2 +M``/&@](`B0>+\HM#!/?A`T<$@](``\:#T@")1P2+\HM#"/?A`T<(@](``\:# +MT@")1PB+\HM##/?A`T<,@](``\:#T@")1PR+\H/#$(/'$$UT`NNDBVPD'(/E +M`W1"BP/WX0,'@](``\:#T@")!XOR370MBT,$]^$#1P2#T@`#QH/2`(E'!(OR +M3705BT,(]^$#1PB#T@`#QH/2`(E'"(ORB\9?7EM=PU535E>+?"04BUPD&(ML +M)!R+3"0@,_;![0)T18L#]^$#QH/2`(D'B_*+0P3WX0/&@](`B4<$B_*+0PCW +MX0/&@](`B4<(B_*+0PSWX0/&@](`B4<,B_*#PQ"#QQ!-=`+KNXML)!R#Y0-T +M,8L#]^$#QH/2`(D'B_)-="&+0P3WX0/&@](`B4<$B_)-=`^+0PCWX0/&@](` +MB4<(B_*+QE]>6UW#4U97BW0D$(M\)!2+7"08P>L"=#6+!_?@B0:)5@2+1P3W +MX(E&"(E6#(M'"/?@B480B584BT<,]^")1AB)5AR#QQ"#QB!+=`+KRXM<)!B# +MXP-T)8L']^")!HE6!$MT&8M'!/?@B48(B58,2W0+BT<(]^")1A")5A1?7EO# +MBU0D!(M$)`CW="0,PRYF:6QE`````````/[_``!G`BY<8W)Y<'1O7&)N7&%S +M;5QX.#9N=#,R+F%S;0```````````"YT97AT``````````$````#`?(!```` 
+M`````````````````"YD871A``````````(````#`0`````````````````` +M```````````$``````````$`(``"```````5````R0````$`(``"```````B +M````:@$```$`(``"```````P````Y0$```$`(``"`#H```!?8FY?;75L7V%D +L9%]W;W)D`%]B;E]M=6Q?=V]R9`!?8FY?<W%R7W=O<F1S`%]B;E]D:78V-``` +` +end diff --git a/crypto/bn/asm/x86w16.asm b/crypto/bn/asm/x86w16.asm new file mode 100644 index 0000000000..66874913e9 --- /dev/null +++ b/crypto/bn/asm/x86w16.asm @@ -0,0 +1,297 @@ +; Static Name Aliases +; + TITLE bn_mulw.c + .8087 +F_TEXT SEGMENT WORD PUBLIC 'CODE' +F_TEXT ENDS +_DATA SEGMENT WORD PUBLIC 'DATA' +_DATA ENDS +CONST SEGMENT WORD PUBLIC 'CONST' +CONST ENDS +_BSS SEGMENT WORD PUBLIC 'BSS' +_BSS ENDS +DGROUP GROUP CONST, _BSS, _DATA + ASSUME DS: DGROUP, SS: DGROUP +F_TEXT SEGMENT + ASSUME CS: F_TEXT + PUBLIC _bn_mul_add_word +_bn_mul_add_word PROC FAR +; Line 58 + push bp + push bx + push si + push di + push ds + push es + mov bp,sp +; w = 26 +; num = 24 +; ap = 20 +; rp = 16 + xor si,si ;c=0; + mov di,WORD PTR [bp+16] ; load r + mov ds,WORD PTR [bp+18] ; load r + mov bx,WORD PTR [bp+20] ; load a + mov es,WORD PTR [bp+22] ; load a + mov cx,WORD PTR [bp+26] ; load w + mov bp,WORD PTR [bp+24] ; load num + + shr bp,1 ; div count by 4 and do groups of 4 + shr bp,1 + je $L555 + +$L546: + mov ax,cx + mul WORD PTR es:[bx] ; w* *a + add ax,WORD PTR ds:[di] ; + *r + adc dx,0 + adc ax,si + adc dx,0 + mov WORD PTR ds:[di],ax + mov si,dx + ; + mov ax,cx + mul WORD PTR es:[bx+2] ; w* *a + add ax,WORD PTR ds:[di+2] ; + *r + adc dx,0 + adc ax,si + adc dx,0 + mov WORD PTR ds:[di+2],ax + mov si,dx + ; + mov ax,cx + mul WORD PTR es:[bx+4] ; w* *a + add ax,WORD PTR ds:[di+4] ; + *r + adc dx,0 + adc ax,si + adc dx,0 + mov WORD PTR ds:[di+4],ax + mov si,dx + ; + mov ax,cx + mul WORD PTR es:[bx+6] ; w* *a + add ax,WORD PTR ds:[di+6] ; + *r + adc dx,0 + adc ax,si + adc dx,0 + mov WORD PTR ds:[di+6],ax + mov si,dx + ; + add bx,8 + add di,8 + ; + dec bp + je $L555 + jmp $L546 +; +; +$L555: + mov bp,sp + mov bp,WORD PTR [bp+24] ; load num + and bp,3 + dec bp + js $L547 + + mov ax,cx + mul WORD PTR es:[bx] ; w* *a + add ax,WORD PTR ds:[di] ; + *r + adc dx,0 + adc ax,si + adc dx,0 + mov WORD PTR ds:[di],ax + mov si,dx + dec bp + js $L547 ; Note that we are now testing for -1 + ; + mov ax,cx + mul WORD PTR es:[bx+2] ; w* *a + add ax,WORD PTR ds:[di+2] ; + *r + adc dx,0 + adc ax,si + adc dx,0 + mov WORD PTR ds:[di+2],ax + mov si,dx + dec bp + js $L547 + ; + mov ax,cx + mul WORD PTR es:[bx+4] ; w* *a + add ax,WORD PTR ds:[di+4] ; + *r + adc dx,0 + adc ax,si + adc dx,0 + mov WORD PTR ds:[di+4],ax + mov si,dx +$L547: + mov ax,si + pop es + pop ds + pop di + pop si + pop bx + pop bp + ret + nop + +_bn_mul_add_word ENDP + PUBLIC _bn_mul_word +_bn_mul_word PROC FAR +; Line 76 + push bp + push bx + push si + push di + push ds + push es + xor si,si + mov bp,sp + mov di,WORD PTR [bp+16] ; r + mov ds,WORD PTR [bp+18] + mov bx,WORD PTR [bp+20] ; a + mov es,WORD PTR [bp+22] + mov cx,WORD PTR [bp+26] ; w + mov bp,WORD PTR [bp+24] ; num +$FC743: + mov ax,cx + mul WORD PTR es:[bx] + add ax,si + adc dx,0 + mov WORD PTR ds:[di],ax + mov si,dx + dec bp + je $L764 + ; + mov ax,cx + mul WORD PTR es:[bx+2] + add ax,si + adc dx,0 + mov WORD PTR ds:[di+2],ax + mov si,dx + dec bp + je $L764 + ; + mov ax,cx + mul WORD PTR es:[bx+4] + add ax,si + adc dx,0 + mov WORD PTR ds:[di+4],ax + mov si,dx + dec bp + je $L764 + ; + mov ax,cx + mul WORD PTR es:[bx+6] + add ax,si + adc dx,0 + mov WORD PTR ds:[di+6],ax + mov si,dx + dec bp + je $L764 + ; + add bx,8 + add di,8 + jmp $FC743 + nop 
+$L764: + mov ax,si + pop es + pop ds + pop di + pop si + pop bx + pop bp + ret + nop +_bn_mul_word ENDP + PUBLIC _bn_sqr_words +_bn_sqr_words PROC FAR +; Line 92 + push bp + push bx + push si + push di + push ds + push es + mov bp,sp + mov si,WORD PTR [bp+16] + mov ds,WORD PTR [bp+18] + mov di,WORD PTR [bp+20] + mov es,WORD PTR [bp+22] + mov bx,WORD PTR [bp+24] + + mov bp,bx ; save a memory lookup later + shr bx,1 ; div count by 4 and do groups of 4 + shr bx,1 + je $L666 + +$L765: + mov ax,WORD PTR es:[di] + mul ax + mov WORD PTR ds:[si],ax + mov WORD PTR ds:[si+2],dx + ; + mov ax,WORD PTR es:[di+2] + mul ax + mov WORD PTR ds:[si+4],ax + mov WORD PTR ds:[si+6],dx + ; + mov ax,WORD PTR es:[di+4] + mul ax + mov WORD PTR ds:[si+8],ax + mov WORD PTR ds:[si+10],dx + ; + mov ax,WORD PTR es:[di+6] + mul ax + mov WORD PTR ds:[si+12],ax + mov WORD PTR ds:[si+14],dx + ; + add di,8 + add si,16 + dec bx + je $L666 + jmp $L765 +$L666: + and bp,3 + dec bp ; The copied value of bx (num) + js $L645 + ; + mov ax,WORD PTR es:[di] + mul ax + mov WORD PTR ds:[si],ax + mov WORD PTR ds:[si+2],dx + dec bp + js $L645 + ; + mov ax,WORD PTR es:[di+2] + mul ax + mov WORD PTR ds:[si+4],ax + mov WORD PTR ds:[si+6],dx + dec bp + js $L645 + ; + mov ax,WORD PTR es:[di+4] + mul ax + mov WORD PTR ds:[si+8],ax + mov WORD PTR ds:[si+10],dx +$L645: + pop es + pop ds + pop di + pop si + pop bx + pop bp + ret + +_bn_sqr_words ENDP + PUBLIC _bn_div64 +_bn_div64 PROC FAR + push bp + mov bp,sp + mov dx, WORD PTR [bp+6] + mov ax, WORD PTR [bp+8] + div WORD PTR [bp+10] + pop bp + ret +_bn_div64 ENDP +F_TEXT ENDS +END diff --git a/crypto/bn/asm/x86w16.uu b/crypto/bn/asm/x86w16.uu new file mode 100644 index 0000000000..89c5e144b7 --- /dev/null +++ b/crypto/bn/asm/x86w16.uu @@ -0,0 +1,20 @@ +begin 640 x86w16.obj +M@!P`&BY<8W)Y<'1O7&)N7&%S;5QX.#9W,38N87-MQY8U```$7T)34P5?1$%4 +M009$1U)/55`&1E]415A4!4-/3E-4`T)34P5#3TY35`1$051!!$-/1$5EF`<` +M2/`!!0H!&)@'`$@```,)`0R8!P!(```&"`$*F`<`2````@<!#YH(``3_`O\# +M_P14D$4```$-7V)N7W-Q<E]W;W)D<U4!``E?8FY?9&EV-C3B`0`07V)N7VUU +M;%]A9&1?=V]R9`````Q?8FY?;75L7W=O<F3<``#`B`0``*(!T:#T`0$``%53 +M5E<>!HOL,_:+?A".7A*+7A2.1A:+3AJ+;AC1[='M=&"+P2;W)P,%@](`$\:# +MT@")!8ORB\$F]V<"`T4"@](`$\:#T@")10*+\HO!)O=G!`-%!(/2`!/&@](` +MB44$B_*+P2;W9P8#10:#T@`3QH/2`(E%!HOR@\,(@\<(370"ZZ"+[(MN&(/E +M`TUX18O!)O<G`P6#T@`3QH/2`(D%B_)->"^+P2;W9P(#10*#T@`3QH/2`(E% +M`HOR37@6B\$F]V<$`T4$@](`$\:#T@")102+\HO&!Q]?7EM=RY!54U97'@8S +M]HOLBWX0CEX2BUX4CD86BTX:BVX8B\$F]R<#QH/2`(D%B_)-=$*+P2;W9P(# +MQH/2`(E%`HOR370OB\$F]V<$`\:#T@")102+\DUT'(O!)O=G!@/&@](`B44& +MB_)-=`F#PPB#QPCKKI"+Q@<?7UY;7<N055-65QX&B^R+=A".7A*+?A2.1A:+ +M7AB+Z]'KT>MT.2:+!??@B02)5`(FBT4"]^")1`2)5`8FBT4$]^")1`B)5`HF +MBT4&]^")1`R)5`Z#QPB#QA!+=`+KQX/E`TUX*":+!??@B02)5`)->!LFBT4" +M]^")1`2)5`9->`PFBT4$]^")1`B)5`H''U]>6UW+58OLBU8&BT8(]W8*7<NZ +%B@(``'0` +` +end diff --git a/crypto/bn/asm/x86w32.asm b/crypto/bn/asm/x86w32.asm new file mode 100644 index 0000000000..0e4452dfa9 --- /dev/null +++ b/crypto/bn/asm/x86w32.asm @@ -0,0 +1,303 @@ +; Static Name Aliases +; + TITLE bn_mulw.c + .386 +F_TEXT SEGMENT WORD USE16 PUBLIC 'CODE' +F_TEXT ENDS +_DATA SEGMENT WORD USE16 PUBLIC 'DATA' +_DATA ENDS +CONST SEGMENT WORD USE16 PUBLIC 'CONST' +CONST ENDS +_BSS SEGMENT WORD USE16 PUBLIC 'BSS' +_BSS ENDS +DGROUP GROUP CONST, _BSS, _DATA + ASSUME DS: DGROUP, SS: DGROUP +F_TEXT SEGMENT + ASSUME CS: F_TEXT + PUBLIC _bn_mul_add_word +_bn_mul_add_word PROC FAR +; Line 58 + push bp + push bx + push esi + push di + push ds + push es + mov bp,sp +; w = 28 +; num = 26 +; ap = 22 +; rp = 18 + xor 
esi,esi ;c=0; + mov di,WORD PTR [bp+18] ; load r + mov ds,WORD PTR [bp+20] ; load r + mov bx,WORD PTR [bp+22] ; load a + mov es,WORD PTR [bp+24] ; load a + mov ecx,DWORD PTR [bp+28] ; load w + mov bp,WORD PTR [bp+26] ; load num + shr bp,1 ; div count by 4 and do groups of 4 + shr bp,1 + je $L555 + +$L546: + mov eax,ecx + mul DWORD PTR es:[bx] ; w* *a + add eax,DWORD PTR ds:[di] ; + *r + adc edx,0 + adc eax,esi + adc edx,0 + mov DWORD PTR ds:[di],eax + mov esi,edx + ; + mov eax,ecx + mul DWORD PTR es:[bx+4] ; w* *a + add eax,DWORD PTR ds:[di+4] ; + *r + adc edx,0 + adc eax,esi + adc edx,0 + mov DWORD PTR ds:[di+4],eax + mov esi,edx + ; + mov eax,ecx + mul DWORD PTR es:[bx+8] ; w* *a + add eax,DWORD PTR ds:[di+8] ; + *r + adc edx,0 + adc eax,esi + adc edx,0 + mov DWORD PTR ds:[di+8],eax + mov esi,edx + ; + mov eax,ecx + mul DWORD PTR es:[bx+12] ; w* *a + add eax,DWORD PTR ds:[di+12] ; + *r + adc edx,0 + adc eax,esi + adc edx,0 + mov DWORD PTR ds:[di+12],eax + mov esi,edx + ; + add bx,16 + add di,16 + ; + dec bp + je $L555 + jmp $L546 +; +; +$L555: + mov bp,sp + mov bp,WORD PTR [bp+26] ; load num + and bp,3 + dec bp + js $L547 + + mov eax,ecx + mul DWORD PTR es:[bx] ; w* *a + add eax,DWORD PTR ds:[di] ; + *r + adc edx,0 + adc eax,esi + adc edx,0 + mov DWORD PTR ds:[di],eax + mov esi,edx + dec bp + js $L547 ; Note that we are now testing for -1 + ; + mov eax,ecx + mul DWORD PTR es:[bx+4] ; w* *a + add eax,DWORD PTR ds:[di+4] ; + *r + adc edx,0 + adc eax,esi + adc edx,0 + mov DWORD PTR ds:[di+4],eax + mov esi,edx + dec bp + js $L547 + ; + mov eax,ecx + mul DWORD PTR es:[bx+8] ; w* *a + add eax,DWORD PTR ds:[di+8] ; + *r + adc edx,0 + adc eax,esi + adc edx,0 + mov DWORD PTR ds:[di+8],eax + mov esi,edx +$L547: + mov eax,esi + mov edx,esi + shr edx,16 + pop es + pop ds + pop di + pop esi + pop bx + pop bp + ret + nop + +_bn_mul_add_word ENDP + PUBLIC _bn_mul_word +_bn_mul_word PROC FAR +; Line 76 + push bp + push bx + push esi + push di + push ds + push es + xor esi,esi + mov bp,sp + mov di,WORD PTR [bp+18] ; r + mov ds,WORD PTR [bp+20] + mov bx,WORD PTR [bp+22] ; a + mov es,WORD PTR [bp+24] + mov ecx,DWORD PTR [bp+28] ; w + mov bp,WORD PTR [bp+26] ; num + +$FC743: + mov eax,ecx + mul DWORD PTR es:[bx] + add eax,esi + adc edx,0 + mov DWORD PTR ds:[di],eax + mov esi,edx + dec bp + je $L764 + ; + mov eax,ecx + mul DWORD PTR es:[bx+4] + add eax,esi + adc edx,0 + mov DWORD PTR ds:[di+4],eax + mov esi,edx + dec bp + je $L764 + ; + mov eax,ecx + mul DWORD PTR es:[bx+8] + add eax,esi + adc edx,0 + mov DWORD PTR ds:[di+8],eax + mov esi,edx + dec bp + je $L764 + ; + mov eax,ecx + mul DWORD PTR es:[bx+12] + add eax,esi + adc edx,0 + mov DWORD PTR ds:[di+12],eax + mov esi,edx + dec bp + je $L764 + ; + add bx,16 + add di,16 + jmp $FC743 + nop +$L764: + mov eax,esi + mov edx,esi + shr edx,16 + pop es + pop ds + pop di + pop esi + pop bx + pop bp + ret + nop +_bn_mul_word ENDP + PUBLIC _bn_sqr_words +_bn_sqr_words PROC FAR +; Line 92 + push bp + push bx + push si + push di + push ds + push es + mov bp,sp + mov si,WORD PTR [bp+16] + mov ds,WORD PTR [bp+18] + mov di,WORD PTR [bp+20] + mov es,WORD PTR [bp+22] + mov bx,WORD PTR [bp+24] + + mov bp,bx ; save a memory lookup later + shr bx,1 ; div count by 4 and do groups of 4 + shr bx,1 + je $L666 + +$L765: + mov eax,DWORD PTR es:[di] + mul eax + mov DWORD PTR ds:[si],eax + mov DWORD PTR ds:[si+4],edx + ; + mov eax,DWORD PTR es:[di+4] + mul eax + mov DWORD PTR ds:[si+8],eax + mov DWORD PTR ds:[si+12],edx + ; + mov eax,DWORD PTR es:[di+8] + mul eax + mov DWORD PTR 
ds:[si+16],eax + mov DWORD PTR ds:[si+20],edx + ; + mov eax,DWORD PTR es:[di+12] + mul eax + mov DWORD PTR ds:[si+24],eax + mov DWORD PTR ds:[si+28],edx + ; + add di,16 + add si,32 + dec bx + je $L666 + jmp $L765 +$L666: + and bp,3 + dec bp ; The copied value of bx (num) + js $L645 + ; + mov eax,DWORD PTR es:[di] + mul eax + mov DWORD PTR ds:[si],eax + mov DWORD PTR ds:[si+4],edx + dec bp + js $L645 + ; + mov eax,DWORD PTR es:[di+4] + mul eax + mov DWORD PTR ds:[si+8],eax + mov DWORD PTR ds:[si+12],edx + dec bp + js $L645 + ; + mov eax,DWORD PTR es:[di+8] + mul eax + mov DWORD PTR ds:[si+16],eax + mov DWORD PTR ds:[si+20],edx +$L645: + pop es + pop ds + pop di + pop si + pop bx + pop bp + ret + +_bn_sqr_words ENDP + PUBLIC _bn_div64 +_bn_div64 PROC FAR + push bp + mov bp,sp + mov edx, DWORD PTR [bp+6] + mov eax, DWORD PTR [bp+10] + div DWORD PTR [bp+14] + mov edx,eax + shr edx,16 + pop bp + ret +_bn_div64 ENDP +F_TEXT ENDS +END diff --git a/crypto/bn/asm/x86w32.uu b/crypto/bn/asm/x86w32.uu new file mode 100644 index 0000000000..edcd84e25e --- /dev/null +++ b/crypto/bn/asm/x86w32.uu @@ -0,0 +1,23 @@ +begin 640 x86w32.obj +M@!P`&BY<8W)Y<'1O7&)N7&%S;5QX.#9W,S(N87-MR98U```$7T)34P5?1$%4 +M009$1U)/55`&1E]415A4!4-/3E-4`T)34P5#3TY35`1$051!!$-/1$5EF`<` +M2(`"!0H!AY@'`$@```,)`0R8!P!(```&"`$*F`<`2````@<!#YH(``3_`O\# +M_P14D$4```$-7V)N7W-Q<E]W;W)D<[\!``E?8FY?9&EV-C1H`@`07V)N7VUU +M;%]A9&1?=V]R9`````Q?8FY?;75L7W=O<F0B`0"(B`0``*(!T:"$`@$``%53 +M9E97'@:+[&8S]HM^$HY>%(M>%HY&&&:+3AR+;AK1[='M#X2``&:+P68F]R=F +M`P5F@](`9A/&9H/2`&:)!6:+\F:+P68F]V<$9@-%!&:#T@!F$\9F@](`9HE% +M!&:+\F:+P68F]V<(9@-%"&:#T@!F$\9F@](`9HE%"&:+\F:+P68F]V<,9@-% +M#&:#T@!F$\9F@](`9HE%#&:+\H/#$(/'$$UT`NN`B^R+;AJ#Y0-->%UFB\%F +M)O<G9@,%9H/2`&83QF:#T@!FB05FB_)->#]FB\%F)O=G!&8#101F@](`9A/& +M9H/2`&:)101FB_)->!YFB\%F)O=G"&8#10AF@](`9A/&9H/2`&:)10AFB_)F +MB\9FB]9FP>H0!Q]?9EY;7<N055-F5E<>!F8S]HOLBWX2CEX4BUX6CD889HM. +M'(MN&F:+P68F]R=F`\9F@](`9HD%9HOR37149HO!9B;W9P1F`\9F@](`9HE% +M!&:+\DUT.V:+P68F]V<(9@/&9H/2`&:)10AFB_)-=")FB\%F)O=G#&8#QF:# +MT@!FB44,9HOR370)@\,0@\<0ZY:09HO&9HO69L'J$`<?7V9>6UW+D%535E<> +M!HOLBW80CEX2BWX4CD86BUX8B^O1Z]'K=$EF)HL%9O?@9HD$9HE4!&8FBT4$ +M9O?@9HE$"&:)5`QF)HM%"&;WX&:)1!!FB5049B:+10QF]^!FB4089HE4'(/' +M$(/&($MT`NNW@^4#37@T9B:+!6;WX&:)!&:)5`1->"-F)HM%!&;WX&:)1`AF +MB50,37@09B:+10AF]^!FB4009HE4%`<?7UY;7<M5B^QFBU8&9HM&"F;W=@YF +.B]!FP>H07<O`B@(``'0` +` +end |
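
The x86 files above all define the same four routines (bn_mul_add_word, bn_mul_word, bn_sqr_words, bn_div64), differing only in symbol naming, comment syntax, and unrolling details for each assembler. As a reading aid, here is a minimal C sketch of what the bn_mul_add_word loops compute; the stdint typedefs and the _c suffixes are illustrative assumptions, not names from the SSLeay source.

    #include <stddef.h>
    #include <stdint.h>

    /* r[i] += a[i] * w for i in 0..num-1; returns the final carry word.
     * Each unrolled round in the assembly is one iteration of this loop:
     * mull forms H(t):L(t) = w * *a, the addl/adcl pairs fold in *r and
     * the running carry c, the low word is stored back to *r and the
     * high word becomes the new carry. */
    uint32_t bn_mul_add_word_c(uint32_t *r, const uint32_t *a,
                               size_t num, uint32_t w)
    {
        uint64_t c = 0;                   /* carry, always fits one word */

        while (num--) {
            c += (uint64_t)w * *a++ + *r; /* sum cannot overflow 64 bits */
            *r++ = (uint32_t)c;           /* store L(t)                  */
            c >>= 32;                     /* keep H(t) as the new carry  */
        }
        return (uint32_t)c;
    }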
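bn_mul_word is the same loop without the read-modify-write of *r, which is why each of its rounds has one addl fewer than the corresponding bn_mul_add_word round. A sketch under the same assumptions:

    #include <stddef.h>
    #include <stdint.h>

    /* r[i] = L(w * a[i] + c) for i in 0..num-1; returns the final carry. */
    uint32_t bn_mul_word_c(uint32_t *r, const uint32_t *a,
                           size_t num, uint32_t w)
    {
        uint64_t c = 0;

        while (num--) {
            c += (uint64_t)w * *a++;   /* H(t):L(t) = w * *a + c */
            *r++ = (uint32_t)c;        /* *r = L(t)              */
            c >>= 32;                  /* c  = H(t)              */
        }
        return (uint32_t)c;
    }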
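bn_sqr_words produces two output words per input word and carries nothing between iterations, which is why the unrolled assembly advances the source pointer by 16 bytes but the destination by 32 per group of four, and why its rounds are independent mull/store pairs:

    #include <stddef.h>
    #include <stdint.h>

    /* r[2i] and r[2i+1] receive the low and high words of a[i] squared. */
    void bn_sqr_words_c(uint32_t *r, const uint32_t *a, size_t n)
    {
        while (n--) {
            uint64_t t = (uint64_t)a[0] * a[0];   /* edx:eax = a*a */
            a++;
            *r++ = (uint32_t)t;                   /* low word      */
            *r++ = (uint32_t)(t >> 32);           /* high word     */
        }
    }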
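Finally, bn_div64 is a single hardware divide: divl (div in the MASM files) divides the double word in edx:eax by its operand and leaves the quotient in eax. A sketch of that contract for the 32-bit targets (the x86w16 build does the same at word size 16), with the caveat inherited from the instruction itself:

    #include <stdint.h>

    /* Return (h:l) / d.  Like divl, this is only well-defined when the
     * quotient fits in one word (i.e. h < d); the real instruction
     * raises a divide fault otherwise, while this sketch truncates. */
    uint32_t bn_div64_c(uint32_t h, uint32_t l, uint32_t d)
    {
        return (uint32_t)((((uint64_t)h << 32) | l) / d);
    }

For example, bn_div64_c(1, 0, 3) returns 0x55555555, matching divl with edx=1, eax=0 and a divisor of 3.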