aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2016-11-25 13:11:09 +0100
committerDr. Stephen Henson <steve@openssl.org>2017-08-30 21:26:43 +0100
commit7d91d9ea6b2281d847e6dcb05e6e3bbd88b60404 (patch)
tree5356dbda3ea32c812d5e15c6de88b3a08f8834fd
parente1a9268d81238aa12acfb9725a13c858c8937cd7 (diff)
downloadopenssl-7d91d9ea6b2281d847e6dcb05e6e3bbd88b60404.tar.gz
Add some C64x assembly modules [by minor adjustments of C64x+ modules].
AES, SHA256 and SHA512 modules can actually replace the corresponding C64x+ modules. This is because C64x+ instructions don't actually provide a "killer-argument" advantage in these modules. As for SHA1, even though its performance is exactly the same, the C64x+ module is more responsive to interrupts, i.e. it doesn't inhibit them for as long as the C64x module does. Reviewed-by: Rich Salz <rsalz@openssl.org> Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Stephen Henson <steve@openssl.org> (Merged from https://github.com/openssl/openssl/pull/4265) (cherry picked from commit 5526e5791f1426553b6f4806d1ac82efd6ab33bc)
-rw-r--r--crypto/aes/asm/aes-c64x.pl1375
-rw-r--r--crypto/c64xcpuid.pl326
-rw-r--r--crypto/sha/asm/sha1-c64x-large.pl230
-rw-r--r--crypto/sha/asm/sha1-c64x.pl330
-rw-r--r--crypto/sha/asm/sha256-c64x.pl313
-rw-r--r--crypto/sha/asm/sha512-c64x.pl437
6 files changed, 3011 insertions, 0 deletions
diff --git a/crypto/aes/asm/aes-c64x.pl b/crypto/aes/asm/aes-c64x.pl
new file mode 100644
index 0000000000..0817128c1b
--- /dev/null
+++ b/crypto/aes/asm/aes-c64x.pl
@@ -0,0 +1,1375 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [Endian-neutral] AES for C64x.
+#
+# Even though loops are scheduled for 13 cycles, and thus expected
+# performance is ~8.5 cycles per byte processed with 128-bit key,
+# measured performance turned out to be ~10 cycles per byte. Discrepancy
+# must be caused by limitations of L1D memory banking(*), see SPRU871
+# TI publication for further details. If it's any consolation, it's still
+# ~20% faster than TI's linear assembly module anyway... Compared to
+# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this
+# code is 3.75x faster and almost 3x smaller (tables included).
+#
+# (*) This means that there might be subtle correlation between data
+# and timing and one can wonder if it can be ... attacked:-(
+# On the other hand this also means that *if* one chooses to
+# implement *4* T-tables variant [instead of 1 T-table as in
+# this implementation, or in addition to], then one ought to
+# *interleave* them. Even though it complicates addressing,
+# references to interleaved tables would be guaranteed not to
+# clash. I reckon that it should be possible to break 8 cycles
+# per byte "barrier," i.e. improve by ~20%, naturally at the
+# cost of 8x increased pressure on L1D. 8x because you'd have
+# to interleave both Te and Td tables...
+
+# Consume command-line arguments up to the first one that looks like a
+# file name ("name.ext") and redirect STDOUT to it.
+while (($output=shift(@ARGV)) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+# Register names interpolated into the assembly text below.
+($TEA,$TEB)=qw(A5 B5);			# lookup table base pointers
+($KPA,$KPB)=qw(A3 B1);			# key schedule pointers
+@K=qw(A6 B6 A7 B7);			# round key words
+@s=qw(A8 B8 A9 B9);			# state words
+# Temporaries for table lookups; decrypt reuses the encrypt registers.
+@Td0=qw(A16 B16 A17 B17); @Te0=@Td0;
+@Td1=qw(A18 B18 A19 B19); @Te1=@Td1;
+@Td2=qw(A20 B20 A21 B21); @Te2=@Td2;
+@Td3=qw(A22 B22 A23 B23); @Te3=@Td3;
+
+$code=<<___;
+ .text
+
+;; Assemblers reporting a version below 7000000 get __TI_EABI__ set to 0.
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+;; Under EABI the underscore-prefixed names used in this file are
+;; substituted with the plain, unprefixed symbol names.
+ .if __TI_EABI__
+ .nocmp
+ .asg AES_encrypt,_AES_encrypt
+ .asg AES_decrypt,_AES_decrypt
+ .asg AES_set_encrypt_key,_AES_set_encrypt_key
+ .asg AES_set_decrypt_key,_AES_set_decrypt_key
+ .asg AES_ctr32_encrypt,_AES_ctr32_encrypt
+ .endif
+
+;; Register aliases used by the routines below: RA is branched to on
+;; return, INP/OUT/KEY are the incoming arguments, RET shares A4 with
+;; INP and carries the return value.
+ .asg B3,RA
+ .asg A4,INP
+ .asg B4,OUT
+ .asg A6,KEY
+ .asg A4,RET
+ .asg B15,SP
+
+;; EXTn is the left-shift count handed to EXTU (always paired with a
+;; right shift of 24) to isolate one byte of a word; TBLn is a rotate
+;; count for ROTL.
+ .eval 24,EXT0
+ .eval 16,EXT1
+ .eval 8,EXT2
+ .eval 0,EXT3
+ .eval 8,TBL1
+ .eval 16,TBL2
+ .eval 24,TBL3
+
+;; Big-endian builds mirror both sets of constants.
+ .if .BIG_ENDIAN
+ .eval 24-EXT0,EXT0
+ .eval 24-EXT1,EXT1
+ .eval 24-EXT2,EXT2
+ .eval 24-EXT3,EXT3
+ .eval 32-TBL1,TBL1
+ .eval 32-TBL2,TBL2
+ .eval 32-TBL3,TBL3
+ .endif
+
+;; AES_encrypt: encrypt the 16-byte block at INP (A4) with the key
+;; schedule at KEY (A6) and store the result at OUT (B4).
+;; The __encrypt entry is shared with AES_ctr32_encrypt, which clears B2
+;; so that the block is passed and returned in A9:A8/B9:B8 instead.
+ .global _AES_encrypt
+_AES_encrypt:
+ .asmfunc
+ MVK 1,B2 ; B2!=0: load input here, store output at the end
+__encrypt:
+ .if __TI_EABI__
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+|| ADDKPC __encrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+|| ADD 0,KEY,$KPA ; pointer walking the even key words
+|| ADD 4,KEY,$KPB ; pointer walking the odd key words
+ .else
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL (AES_Te-__encrypt),$TEA
+|| ADDKPC __encrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH (AES_Te-__encrypt),$TEA
+|| ADD 0,KEY,$KPA ; pointer walking the even key words
+|| ADD 4,KEY,$KPB ; pointer walking the odd key words
+ .endif
+ LDW *$KPA++[2],$Te0[0] ; zero round key
+|| LDW *$KPB++[2],$Te0[1]
+|| MVK 60,A0 ; word index of the rounds field
+|| ADD B0,$TEA,$TEA ; AES_Te
+ LDW *KEY[A0],B0 ; rounds
+|| MVK 1024,A0 ; sizeof(AES_Te)
+ LDW *$KPA++[2],$Te0[2]
+|| LDW *$KPB++[2],$Te0[3]
+|| MV $TEA,$TEB ; same table base for the B-side loads
+ NOP ; remaining load delay slot before the input is used
+ .if .BIG_ENDIAN
+ MV A9,$s[0]
+|| MV A8,$s[1]
+|| MV B9,$s[2]
+|| MV B8,$s[3]
+ .else
+ MV A8,$s[0]
+|| MV A9,$s[1]
+|| MV B8,$s[2]
+|| MV B9,$s[3]
+ .endif
+ XOR $Te0[0],$s[0],$s[0]
+|| XOR $Te0[1],$s[1],$s[1]
+|| LDW *$KPA++[2],$K[0] ; 1st round key
+|| LDW *$KPB++[2],$K[1]
+
+ LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+|| EXTU $s[1],EXT1,24,$Te1[1]
+|| EXTU $s[0],EXT3,24,$Te3[0]
+|| SUB B0,1,B0 ; loop counter = rounds-1
+;;====================================================================
+;; One round per iteration: EXTU isolates state bytes, LDW fetches the
+;; table words through the two base pointers, ROTL rotates them and the
+;; results are XORed with the round key words. B0 counts the remaining
+;; rounds; once it is zero both table pointers are advanced by A0 so
+;; that the code after the loop reads the table the source calls Te4.
+enc_loop?:
+ LDW *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0
+|| LDW *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1
+|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[1],EXT3,24,$Te3[1]
+|| EXTU $s[0],EXT1,24,$Te1[0]
+ LDW *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2
+|| LDW *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3
+|| EXTU $s[2],EXT2,24,$Te2[2]
+|| EXTU $s[3],EXT2,24,$Te2[3]
+ LDW *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0
+|| LDW *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1
+|| EXTU $s[3],EXT3,24,$Te3[3]
+|| EXTU $s[2],EXT1,24,$Te1[2]
+ LDW *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0
+|| LDW *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1
+|| EXTU $s[0],EXT2,24,$Te2[0]
+|| EXTU $s[1],EXT2,24,$Te2[1]
+ LDW *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2
+|| LDW *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3
+|| EXTU $s[3],EXT1,24,$Te1[3]
+|| EXTU $s[2],EXT3,24,$Te3[2]
+ LDW *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2
+|| LDW *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3
+|| ROTL $Te1[1],TBL1,$Te3[0] ; t0
+|| ROTL $Te3[0],TBL3,$Te1[1] ; t1
+|| EXTU $s[0],EXT0,24,$Te0[0]
+|| EXTU $s[1],EXT0,24,$Te0[1]
+ LDW *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0
+|| LDW *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1
+|| ROTL $Te3[1],TBL3,$Te1[0] ; t2
+|| ROTL $Te1[0],TBL1,$Te3[1] ; t3
+|| EXTU $s[2],EXT0,24,$Te0[2]
+|| EXTU $s[3],EXT0,24,$Te0[3]
+|| [B0] SUB B0,1,B0 ; one round fewer to go
+ LDW *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2
+|| LDW *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3
+|| ROTL $Te2[2],TBL2,$Te2[2] ; t0
+|| ROTL $Te2[3],TBL2,$Te2[3] ; t1
+|| XOR $K[0],$Te3[0],$s[0]
+|| XOR $K[1],$Te1[1],$s[1]
+|| [B0] BNOP enc_loop? ; repeat while rounds remain
+ ROTL $Te3[3],TBL3,$Te1[2] ; t0
+|| ROTL $Te1[2],TBL1,$Te3[3] ; t1
+|| XOR $K[2],$Te1[0],$s[2]
+|| XOR $K[3],$Te3[1],$s[3]
+|| LDW *$KPA++[2],$K[0] ; next round key
+|| LDW *$KPB++[2],$K[1]
+ ROTL $Te2[0],TBL2,$Te2[0] ; t2
+|| ROTL $Te2[1],TBL2,$Te2[1] ; t3
+|| XOR $s[0],$Te2[2],$s[0]
+|| XOR $s[1],$Te2[3],$s[1]
+|| LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+ ROTL $Te1[3],TBL1,$Te3[2] ; t2
+|| ROTL $Te3[2],TBL3,$Te1[3] ; t3
+|| XOR $s[0],$Te1[2],$s[0]
+|| XOR $s[1],$Te3[3],$s[1]
+ XOR $s[2],$Te2[0],$s[2]
+|| XOR $s[3],$Te2[1],$s[3]
+|| XOR $s[0],$Te0[0],$s[0]
+|| XOR $s[1],$Te0[1],$s[1]
+ XOR $s[2],$Te3[2],$s[2]
+|| XOR $s[3],$Te1[3],$s[3]
+|| EXTU $s[1],EXT1,24,$Te1[1]
+|| EXTU $s[0],EXT3,24,$Te3[0]
+||[!B0] ADD ${TEA},A0,${TEA} ; point to Te4
+||[!B0] ADD ${TEB},A0,${TEB}
+;;====================================================================
+;; Final round: single bytes are fetched with LDBU from the table the
+;; pointers were advanced to, packed into words with PACK2/PACKL4 and
+;; XORed with the last round key. The result is left in A9:A8/B9:B8 and
+;; is also stored to OUT when B2 is set.
+ LDBU *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0
+|| LDBU *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1
+|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[0],EXT0,24,$Te0[0]
+|| EXTU $s[1],EXT0,24,$Te0[1]
+ LDBU *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0
+|| LDBU *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1
+|| EXTU $s[3],EXT3,24,$Te3[3]
+|| EXTU $s[2],EXT1,24,$Te1[2]
+ LDBU *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0
+|| LDBU *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1
+|| EXTU $s[2],EXT2,24,$Te2[2]
+|| EXTU $s[3],EXT2,24,$Te2[3]
+ LDBU *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0
+|| LDBU *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1
+|| EXTU $s[1],EXT3,24,$Te3[1]
+|| EXTU $s[0],EXT1,24,$Te1[0]
+ LDBU *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2
+|| LDBU *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3
+|| EXTU $s[3],EXT1,24,$Te1[3]
+|| EXTU $s[2],EXT3,24,$Te3[2]
+ LDBU *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2
+|| LDBU *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3
+|| EXTU $s[2],EXT0,24,$Te0[2]
+|| EXTU $s[3],EXT0,24,$Te0[3]
+ LDBU *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2
+|| LDBU *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3
+|| EXTU $s[0],EXT2,24,$Te2[0]
+|| EXTU $s[1],EXT2,24,$Te2[1]
+ LDBU *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2
+|| LDBU *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3
+
+ .if .BIG_ENDIAN
+ PACK2 $Te0[0],$Te1[1],$Te0[0]
+|| PACK2 $Te0[1],$Te1[2],$Te0[1]
+ PACK2 $Te2[2],$Te3[3],$Te2[2]
+|| PACK2 $Te2[3],$Te3[0],$Te2[3]
+ PACKL4 $Te0[0],$Te2[2],$Te0[0]
+|| PACKL4 $Te0[1],$Te2[3],$Te0[1]
+ XOR $K[0],$Te0[0],$Te0[0] ; s[0]
+|| XOR $K[1],$Te0[1],$Te0[1] ; s[1]
+
+ PACK2 $Te0[2],$Te1[3],$Te0[2]
+|| PACK2 $Te0[3],$Te1[0],$Te0[3]
+ PACK2 $Te2[0],$Te3[1],$Te2[0]
+|| PACK2 $Te2[1],$Te3[2],$Te2[1]
+|| BNOP RA ; return; the packets below fill the delay slots
+ PACKL4 $Te0[2],$Te2[0],$Te0[2]
+|| PACKL4 $Te0[3],$Te2[1],$Te0[3]
+ XOR $K[2],$Te0[2],$Te0[2] ; s[2]
+|| XOR $K[3],$Te0[3],$Te0[3] ; s[3]
+
+ MV $Te0[0],A9
+|| MV $Te0[1],A8
+ MV $Te0[2],B9
+|| MV $Te0[3],B8
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .else
+ PACK2 $Te1[1],$Te0[0],$Te1[1]
+|| PACK2 $Te1[2],$Te0[1],$Te1[2]
+ PACK2 $Te3[3],$Te2[2],$Te3[3]
+|| PACK2 $Te3[0],$Te2[3],$Te3[0]
+ PACKL4 $Te3[3],$Te1[1],$Te1[1]
+|| PACKL4 $Te3[0],$Te1[2],$Te1[2]
+ XOR $K[0],$Te1[1],$Te1[1] ; s[0]
+|| XOR $K[1],$Te1[2],$Te1[2] ; s[1]
+
+ PACK2 $Te1[3],$Te0[2],$Te1[3]
+|| PACK2 $Te1[0],$Te0[3],$Te1[0]
+ PACK2 $Te3[1],$Te2[0],$Te3[1]
+|| PACK2 $Te3[2],$Te2[1],$Te3[2]
+|| BNOP RA ; return; the packets below fill the delay slots
+ PACKL4 $Te3[1],$Te1[3],$Te1[3]
+|| PACKL4 $Te3[2],$Te1[0],$Te1[0]
+ XOR $K[2],$Te1[3],$Te1[3] ; s[2]
+|| XOR $K[3],$Te1[0],$Te1[0] ; s[3]
+
+ MV $Te1[1],A8
+|| MV $Te1[2],A9
+ MV $Te1[3],B8
+|| MV $Te1[0],B9
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .endif
+ .endasmfunc
+
+;; AES_decrypt: decrypt the 16-byte block at INP (A4) with the key
+;; schedule at KEY (A6) and store the result at OUT (B4). Same layout
+;; as the encrypt routine, but using the AES_Td table; B2 again selects
+;; whether the block is loaded from INP and stored to OUT.
+ .global _AES_decrypt
+_AES_decrypt:
+ .asmfunc
+ MVK 1,B2 ; B2!=0: load input here, store output at the end
+__decrypt:
+ .if __TI_EABI__
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+|| ADDKPC __decrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+|| ADD 0,KEY,$KPA ; pointer walking the even key words
+|| ADD 4,KEY,$KPB ; pointer walking the odd key words
+ .else
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL (AES_Td-__decrypt),$TEA
+|| ADDKPC __decrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH (AES_Td-__decrypt),$TEA
+|| ADD 0,KEY,$KPA ; pointer walking the even key words
+|| ADD 4,KEY,$KPB ; pointer walking the odd key words
+ .endif
+ LDW *$KPA++[2],$Td0[0] ; zero round key
+|| LDW *$KPB++[2],$Td0[1]
+|| MVK 60,A0 ; word index of the rounds field
+|| ADD B0,$TEA,$TEA ; AES_Td
+ LDW *KEY[A0],B0 ; rounds
+|| MVK 1024,A0 ; sizeof(AES_Td)
+ LDW *$KPA++[2],$Td0[2]
+|| LDW *$KPB++[2],$Td0[3]
+|| MV $TEA,$TEB ; same table base for the B-side loads
+ NOP ; remaining load delay slot before the input is used
+ .if .BIG_ENDIAN
+ MV A9,$s[0]
+|| MV A8,$s[1]
+|| MV B9,$s[2]
+|| MV B8,$s[3]
+ .else
+ MV A8,$s[0]
+|| MV A9,$s[1]
+|| MV B8,$s[2]
+|| MV B9,$s[3]
+ .endif
+ XOR $Td0[0],$s[0],$s[0]
+|| XOR $Td0[1],$s[1],$s[1]
+|| LDW *$KPA++[2],$K[0] ; 1st round key
+|| LDW *$KPB++[2],$K[1]
+
+ LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+|| EXTU $s[1],EXT3,24,$Td3[1]
+|| EXTU $s[0],EXT1,24,$Td1[0]
+|| SUB B0,1,B0 ; loop counter = rounds-1
+;;====================================================================
+;; One round per iteration: EXTU isolates state bytes, LDW fetches the
+;; table words through the two base pointers, ROTL rotates them and the
+;; results are XORed with the round key words. B0 counts the remaining
+;; rounds; once it is zero both table pointers are advanced by A0 so
+;; that the code after the loop reads the table the source calls Td4.
+dec_loop?:
+ LDW *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0
+|| LDW *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1
+|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[1],EXT1,24,$Td1[1]
+|| EXTU $s[0],EXT3,24,$Td3[0]
+ LDW *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2
+|| LDW *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3
+|| EXTU $s[2],EXT2,24,$Td2[2]
+|| EXTU $s[3],EXT2,24,$Td2[3]
+ LDW *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0
+|| LDW *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1
+|| EXTU $s[3],EXT1,24,$Td1[3]
+|| EXTU $s[2],EXT3,24,$Td3[2]
+ LDW *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0
+|| LDW *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1
+|| EXTU $s[0],EXT2,24,$Td2[0]
+|| EXTU $s[1],EXT2,24,$Td2[1]
+ LDW *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2
+|| LDW *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3
+|| EXTU $s[3],EXT3,24,$Td3[3]
+|| EXTU $s[2],EXT1,24,$Td1[2]
+ LDW *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2
+|| LDW *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3
+|| ROTL $Td3[1],TBL3,$Td1[0] ; t0
+|| ROTL $Td1[0],TBL1,$Td3[1] ; t1
+|| EXTU $s[0],EXT0,24,$Td0[0]
+|| EXTU $s[1],EXT0,24,$Td0[1]
+ LDW *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0
+|| LDW *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1
+|| ROTL $Td1[1],TBL1,$Td3[0] ; t2
+|| ROTL $Td3[0],TBL3,$Td1[1] ; t3
+|| EXTU $s[2],EXT0,24,$Td0[2]
+|| EXTU $s[3],EXT0,24,$Td0[3]
+|| [B0] SUB B0,1,B0 ; one round fewer to go
+ LDW *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2
+|| LDW *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3
+|| ROTL $Td2[2],TBL2,$Td2[2] ; t0
+|| ROTL $Td2[3],TBL2,$Td2[3] ; t1
+|| XOR $K[0],$Td1[0],$s[0]
+|| XOR $K[1],$Td3[1],$s[1]
+|| [B0] BNOP dec_loop? ; repeat while rounds remain
+ ROTL $Td1[3],TBL1,$Td3[2] ; t0
+|| ROTL $Td3[2],TBL3,$Td1[3] ; t1
+|| XOR $K[2],$Td3[0],$s[2]
+|| XOR $K[3],$Td1[1],$s[3]
+|| LDW *$KPA++[2],$K[0] ; next round key
+|| LDW *$KPB++[2],$K[1]
+ ROTL $Td2[0],TBL2,$Td2[0] ; t2
+|| ROTL $Td2[1],TBL2,$Td2[1] ; t3
+|| XOR $s[0],$Td2[2],$s[0]
+|| XOR $s[1],$Td2[3],$s[1]
+|| LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+ ROTL $Td3[3],TBL3,$Td1[2] ; t2
+|| ROTL $Td1[2],TBL1,$Td3[3] ; t3
+|| XOR $s[0],$Td3[2],$s[0]
+|| XOR $s[1],$Td1[3],$s[1]
+ XOR $s[2],$Td2[0],$s[2]
+|| XOR $s[3],$Td2[1],$s[3]
+|| XOR $s[0],$Td0[0],$s[0]
+|| XOR $s[1],$Td0[1],$s[1]
+ XOR $s[2],$Td1[2],$s[2]
+|| XOR $s[3],$Td3[3],$s[3]
+|| EXTU $s[1],EXT3,24,$Td3[1]
+|| EXTU $s[0],EXT1,24,$Td1[0]
+||[!B0] ADD ${TEA},A0,${TEA} ; point to Td4
+||[!B0] ADD ${TEB},A0,${TEB}
+;;====================================================================
+;; Final round: single bytes are fetched with LDBU from the table the
+;; pointers were advanced to, packed into words with PACK2/PACKL4 and
+;; XORed with the last round key. The result is left in A9:A8/B9:B8 and
+;; is also stored to OUT when B2 is set.
+ LDBU *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0
+|| LDBU *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1
+|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[0],EXT0,24,$Td0[0]
+|| EXTU $s[1],EXT0,24,$Td0[1]
+ LDBU *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0
+|| LDBU *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1
+|| EXTU $s[2],EXT2,24,$Td2[2]
+|| EXTU $s[3],EXT2,24,$Td2[3]
+ LDBU *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0
+|| LDBU *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1
+|| EXTU $s[3],EXT1,24,$Td1[3]
+|| EXTU $s[2],EXT3,24,$Td3[2]
+ LDBU *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0
+|| LDBU *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1
+|| EXTU $s[1],EXT1,24,$Td1[1]
+|| EXTU $s[0],EXT3,24,$Td3[0]
+ LDBU *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2
+|| LDBU *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3
+|| EXTU $s[0],EXT2,24,$Td2[0]
+|| EXTU $s[1],EXT2,24,$Td2[1]
+ LDBU *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2
+|| LDBU *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3
+|| EXTU $s[3],EXT3,24,$Td3[3]
+|| EXTU $s[2],EXT1,24,$Td1[2]
+ LDBU *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2
+|| LDBU *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3
+|| EXTU $s[2],EXT0,24,$Td0[2]
+|| EXTU $s[3],EXT0,24,$Td0[3]
+ LDBU *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2
+|| LDBU *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3
+
+ .if .BIG_ENDIAN
+ PACK2 $Td0[0],$Td1[3],$Td0[0]
+|| PACK2 $Td0[1],$Td1[0],$Td0[1]
+ PACK2 $Td2[2],$Td3[1],$Td2[2]
+|| PACK2 $Td2[3],$Td3[2],$Td2[3]
+ PACKL4 $Td0[0],$Td2[2],$Td0[0]
+|| PACKL4 $Td0[1],$Td2[3],$Td0[1]
+ XOR $K[0],$Td0[0],$Td0[0] ; s[0]
+|| XOR $K[1],$Td0[1],$Td0[1] ; s[1]
+
+ PACK2 $Td0[2],$Td1[1],$Td0[2]
+|| PACK2 $Td0[3],$Td1[2],$Td0[3]
+ PACK2 $Td2[0],$Td3[3],$Td2[0]
+|| PACK2 $Td2[1],$Td3[0],$Td2[1]
+|| BNOP RA ; return; the packets below fill the delay slots
+ PACKL4 $Td0[2],$Td2[0],$Td0[2]
+|| PACKL4 $Td0[3],$Td2[1],$Td0[3]
+ XOR $K[2],$Td0[2],$Td0[2] ; s[2]
+|| XOR $K[3],$Td0[3],$Td0[3] ; s[3]
+
+ MV $Td0[0],A9
+|| MV $Td0[1],A8
+ MV $Td0[2],B9
+|| MV $Td0[3],B8
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .else
+ PACK2 $Td1[3],$Td0[0],$Td1[3]
+|| PACK2 $Td1[0],$Td0[1],$Td1[0]
+ PACK2 $Td3[1],$Td2[2],$Td3[1]
+|| PACK2 $Td3[2],$Td2[3],$Td3[2]
+ PACKL4 $Td3[1],$Td1[3],$Td1[3]
+|| PACKL4 $Td3[2],$Td1[0],$Td1[0]
+ XOR $K[0],$Td1[3],$Td1[3] ; s[0]
+|| XOR $K[1],$Td1[0],$Td1[0] ; s[1]
+
+ PACK2 $Td1[1],$Td0[2],$Td1[1]
+|| PACK2 $Td1[2],$Td0[3],$Td1[2]
+ PACK2 $Td3[3],$Td2[0],$Td3[3]
+|| PACK2 $Td3[0],$Td2[1],$Td3[0]
+|| BNOP RA ; return; the packets below fill the delay slots
+ PACKL4 $Td3[3],$Td1[1],$Td1[1]
+|| PACKL4 $Td3[0],$Td1[2],$Td1[2]
+ XOR $K[2],$Td1[1],$Td1[1] ; s[2]
+|| XOR $K[3],$Td1[2],$Td1[2] ; s[3]
+
+ MV $Td1[3],A8
+|| MV $Td1[0],A9
+ MV $Td1[1],B8
+|| MV $Td1[2],B9
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .endif
+ .endasmfunc
+___
+{
+my @K=(@K,@s); # extended key
+my @Te4=map("B$_",(16..19)); # B16..B19, scratch registers
+
+my @Kx9=@Te0; # used in AES_set_decrypt_key
+my @KxB=@Te1;
+my @KxD=@Te2;
+my @KxE=@Te3;
+
+$code.=<<___;
+ .asg OUT,BITS
+
+;; AES_set_encrypt_key: expand the user key at INP (A4), whose length in
+;; bits is in BITS (B4), into the key schedule at KEY (A6).
+;; Returns -1 in RET when INP or KEY is zero and -2 for a bit length
+;; that selects none of the three branches below. B0 is left at zero on
+;; those error paths; AES_set_decrypt_key tests it after calling here.
+ .global _AES_set_encrypt_key
+_AES_set_encrypt_key:
+__set_encrypt_key:
+ .asmfunc
+ MV INP,A0
+|| SHRU BITS,5,BITS ; 128-192-256 -> 4-6-8
+|| MV KEY,A1
+ [!A0] B RA ; zero INP: return -1
+||[!A0] MVK -1,RET
+||[!A0] MVK 1,A1 ; only one B RA
+ [!A1] B RA ; zero KEY: return -1
+||[!A1] MVK -1,RET
+||[!A1] MVK 0,A0
+|| MVK 0,B0
+|| MVK 0,A1
+ [A0] LDNDW *INP++,A9:A8
+|| [A0] CMPEQ 4,BITS,B0
+|| [A0] CMPLT 3,BITS,A1
+ [B0] B key128?
+|| [A1] LDNDW *INP++,B9:B8
+|| [A0] CMPEQ 6,BITS,B0
+|| [A0] CMPLT 5,BITS,A1
+ [B0] B key192?
+|| [A1] LDNDW *INP++,B17:B16
+|| [A0] CMPEQ 8,BITS,B0
+|| [A0] CMPLT 7,BITS,A1
+ [B0] B key256?
+|| [A1] LDNDW *INP++,B19:B18
+
+ .if __TI_EABI__
+ [A0] ADD 0,KEY,$KPA
+|| [A0] ADD 4,KEY,$KPB
+|| [A0] MVKL \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+|| [A0] ADDKPC __set_encrypt_key,B6
+ [A0] MVKH \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+ [A0] ADD B6,$TEA,$TEA ; AES_Te4
+ .else
+ [A0] ADD 0,KEY,$KPA
+|| [A0] ADD 4,KEY,$KPB
+|| [A0] MVKL (AES_Te4-__set_encrypt_key),$TEA
+|| [A0] ADDKPC __set_encrypt_key,B6
+ [A0] MVKH (AES_Te4-__set_encrypt_key),$TEA
+ [A0] ADD B6,$TEA,$TEA ; AES_Te4
+ .endif
+ NOP ; delay slots of the branches issued above
+ NOP
+
+;; Reached only when none of the three size branches was taken.
+ BNOP RA,5
+|| MVK -2,RET ; unknown bit length
+|| MVK 0,B0 ; redundant
+;;====================================================================
+;;====================================================================
+;; 128-bit key: the four key words are held in registers; each pass of
+;; loop128? stores them and derives the next four from the table bytes
+;; and rcon[i]. The tail stores the last four words and the round count.
+key128?:
+ .if .BIG_ENDIAN
+ MV A9,$K[0]
+|| MV A8,$K[1]
+|| MV B9,$Te4[2]
+|| MV B8,$K[3]
+ .else
+ MV A8,$K[0]
+|| MV A9,$K[1]
+|| MV B8,$Te4[2]
+|| MV B9,$K[3]
+ .endif
+
+ MVK 256,A0 ; byte offset from the table base to rcon
+|| MVK 8,B0 ; BDEC loop counter
+
+ MV $TEA,$TEB
+|| ADD $TEA,A0,A30 ; rcon
+;;====================================================================
+loop128?:
+ LDW *A30++[1],A31 ; rcon[i]
+|| MV $Te4[2],$K[2]
+|| EXTU $K[3],EXT1,24,$Te4[0]
+ LDBU *${TEB}[$Te4[0]],$Te4[0] ; table byte for each byte of K[3]
+|| MV $K[3],A0
+|| EXTU $K[3],EXT2,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT3,24,A0
+|| EXTU $K[3],EXT0,24,$Te4[3]
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ .endif
+
+ STW $K[0],*$KPA++[2] ; store the current four key words
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+
+ XOR A31,$K[0],$K[0] ; ^=rcon[i]
+ .if .BIG_ENDIAN
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| BDEC loop128?,B0
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+ .else
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| BDEC loop128?,B0
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+ .endif
+ XOR $Te4[3],$K[0],$Te4[0] ; K[0]
+ XOR $Te4[0],$K[1],$K[1] ; K[1]
+ MV $Te4[0],$K[0]
+|| XOR $K[1],$K[2],$Te4[2] ; K[2]
+ XOR $Te4[2],$K[3],$K[3] ; K[3]
+;;====================================================================
+ BNOP RA ; return; the packets below fill the delay slots
+ MV $Te4[2],$K[2]
+|| STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ MVK 10,B0 ; rounds
+ STW B0,*++${KPB}[15]
+ MVK 0,RET ; success
+;;====================================================================
+;;====================================================================
+;; 192-bit key: six key words are held in registers; each pass of
+;; loop192? stores them and derives the next six from the table bytes
+;; and rcon[i]. The tail stores four more words and the round count.
+key192?:
+ .if .BIG_ENDIAN
+ MV A9,$K[0]
+|| MV A8,$K[1]
+|| MV B9,$K[2]
+|| MV B8,$K[3]
+ MV B17,$Te4[2]
+|| MV B16,$K[5]
+ .else
+ MV A8,$K[0]
+|| MV A9,$K[1]
+|| MV B8,$K[2]
+|| MV B9,$K[3]
+ MV B16,$Te4[2]
+|| MV B17,$K[5]
+ .endif
+
+ MVK 256,A0 ; byte offset from the table base to rcon
+|| MVK 6,B0 ; BDEC loop counter
+ MV $TEA,$TEB
+|| ADD $TEA,A0,A30 ; rcon
+;;====================================================================
+loop192?:
+ LDW *A30++[1],A31 ; rcon[i]
+|| MV $Te4[2],$K[4]
+|| EXTU $K[5],EXT1,24,$Te4[0]
+ LDBU *${TEB}[$Te4[0]],$Te4[0] ; table byte for each byte of K[5]
+|| MV $K[5],A0
+|| EXTU $K[5],EXT2,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT3,24,A0
+|| EXTU $K[5],EXT0,24,$Te4[3]
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ .endif
+
+ STW $K[0],*$KPA++[2] ; store the current six key words
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ STW $K[4],*$KPA++[2]
+|| STW $K[5],*$KPB++[2]
+
+ XOR A31,$K[0],$K[0] ; ^=rcon[i]
+ .if .BIG_ENDIAN
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+ .else
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+ .endif
+ BDEC loop192?,B0
+|| XOR $Te4[3],$K[0],$Te4[0] ; K[0]
+ XOR $Te4[0],$K[1],$K[1] ; K[1]
+ MV $Te4[0],$K[0]
+|| XOR $K[1],$K[2],$Te4[2] ; K[2]
+ XOR $Te4[2],$K[3],$K[3] ; K[3]
+ MV $Te4[2],$K[2]
+|| XOR $K[3],$K[4],$Te4[2] ; K[4]
+ XOR $Te4[2],$K[5],$K[5] ; K[5]
+;;====================================================================
+ BNOP RA ; return; the packets below fill the delay slots
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ MVK 12,B0 ; rounds
+ STW B0,*++${KPB}[7]
+ MVK 0,RET ; success
+;;====================================================================
+;;====================================================================
+;; 256-bit key: eight key words are held in registers. Each pass of
+;; loop256? stores them, derives K[0..3] from the table bytes of K[7]
+;; and rcon[i], then derives K[4..7] from the table bytes of the new
+;; K[3] (no rcon in that second half). When B0 has reached zero the
+;; loop leaves for done256? after K[0..3] have been computed.
+key256?:
+ .if .BIG_ENDIAN
+ MV A9,$K[0]
+|| MV A8,$K[1]
+|| MV B9,$K[2]
+|| MV B8,$K[3]
+ MV B17,$K[4]
+|| MV B16,$K[5]
+|| MV B19,$Te4[2]
+|| MV B18,$K[7]
+ .else
+ MV A8,$K[0]
+|| MV A9,$K[1]
+|| MV B8,$K[2]
+|| MV B9,$K[3]
+ MV B16,$K[4]
+|| MV B17,$K[5]
+|| MV B18,$Te4[2]
+|| MV B19,$K[7]
+ .endif
+
+ MVK 256,A0 ; byte offset from the table base to rcon
+|| MVK 6,B0 ; loop counter, decremented once per pass
+ MV $TEA,$TEB
+|| ADD $TEA,A0,A30 ; rcon
+;;====================================================================
+loop256?:
+ LDW *A30++[1],A31 ; rcon[i]
+|| MV $Te4[2],$K[6]
+|| EXTU $K[7],EXT1,24,$Te4[0]
+ LDBU *${TEB}[$Te4[0]],$Te4[0] ; table byte for each byte of K[7]
+|| MV $K[7],A0
+|| EXTU $K[7],EXT2,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT3,24,A0
+|| EXTU $K[7],EXT0,24,$Te4[3]
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ .endif
+
+ STW $K[0],*$KPA++[2] ; store the current eight key words
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ STW $K[4],*$KPA++[2]
+|| STW $K[5],*$KPB++[2]
+ STW $K[6],*$KPA++[2]
+|| STW $K[7],*$KPB++[2]
+|| XOR A31,$K[0],$K[0] ; ^=rcon[i]
+ .if .BIG_ENDIAN
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+||[!B0] B done256? ; last pass: leave once K[0..3] are ready
+ .else
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+||[!B0] B done256? ; last pass: leave once K[0..3] are ready
+ .endif
+ XOR $Te4[3],$K[0],$Te4[0] ; K[0]
+ XOR $Te4[0],$K[1],$K[1] ; K[1]
+ MV $Te4[0],$K[0]
+|| XOR $K[1],$K[2],$Te4[2] ; K[2]
+ XOR $Te4[2],$K[3],$K[3] ; K[3]
+
+;; Second half: table bytes of the new K[3], taken in EXT0..EXT3 order.
+ MV $Te4[2],$K[2]
+|| [B0] EXTU $K[3],EXT0,24,$Te4[0]
+|| [B0] SUB B0,1,B0
+ LDBU *${TEB}[$Te4[0]],$Te4[0]
+|| MV $K[3],A0
+|| EXTU $K[3],EXT1,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT2,24,A0
+|| EXTU $K[3],EXT3,24,$Te4[3]
+
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ NOP 3
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| B loop256?
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ NOP 3
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| B loop256?
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+ .endif
+
+ XOR $Te4[3],$K[4],$Te4[0] ; K[4]
+ XOR $Te4[0],$K[5],$K[5] ; K[5]
+ MV $Te4[0],$K[4]
+|| XOR $K[5],$K[6],$Te4[2] ; K[6]
+ XOR $Te4[2],$K[7],$K[7] ; K[7]
+;;====================================================================
+done256?:
+ BNOP RA ; return; the packets below fill the delay slots
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ MVK 14,B0 ; rounds
+ STW B0,*--${KPB}[1]
+ MVK 0,RET ; success
+ .endasmfunc
+
+;; AES_set_decrypt_key: same arguments as AES_set_encrypt_key. The
+;; encryption schedule is built first through a local call, then it is
+;; converted in place by the two loops that follow. If the call comes
+;; back with B0 equal to zero the routine returns at once, leaving RET
+;; as the callee set it.
+ .global _AES_set_decrypt_key
+_AES_set_decrypt_key:
+ .asmfunc
+ B __set_encrypt_key ; guarantee local call
+ MV KEY,B30 ; B30 is not modified
+ MV RA, B31 ; B31 is not modified
+ ADDKPC ret?,RA,2
+ret?: ; B0 holds rounds or zero
+ [!B0] BNOP B31 ; return if zero
+ [B0] SHL B0,4,A0 ; offset to last round key
+ [B0] SHRU B0,1,B2 ; rounds/2
+ [B0] SUB B2,2,B2 ; BDEC counter for flip_loop?
+|| [B0] MVK 0x0000001B,B3 ; AES polynomial
+ [B0] MVKH 0x07000000,B3 ; upper half of the value written to GFPGFR
+|| [B0] MV B30,$KPA
+ [B0] ADD B30,A0,$KPB
+|| [B0] MVK 16,A0 ; sizeof(round key)
+;;====================================================================
+;; Swap round keys pairwise from both ends of the schedule: one pointer
+;; walks up from the first round key, the other walks down from the
+;; last, 16 bytes per pass, and the four words loaded through each are
+;; stored through the other.
+flip_loop?:
+ LDW *${KPA}[0],A16
+|| LDW *${KPB}[0],B16
+ LDW *${KPA}[1],A17
+|| LDW *${KPB}[1],B17
+ LDW *${KPA}[2],A18
+|| LDW *${KPB}[2],B18
+ LDW *${KPA}[3],A19
+|| ADD $KPA,A0,$KPA ; front pointer moves up one round key
+|| LDW *${KPB}[3],B19
+|| SUB $KPB,A0,$KPB ; back pointer moves down one round key
+|| BDEC flip_loop?,B2
+ NOP
+ STW B16,*${KPA}[-4] ; negative/positive indexes undo the pointer update
+|| STW A16,*${KPB}[4]
+ STW B17,*${KPA}[-3]
+|| STW A17,*${KPB}[5]
+ STW B18,*${KPA}[-2]
+|| STW A18,*${KPB}[6]
+ STW B19,*${KPA}[-1]
+|| STW A19,*${KPB}[7]
+;;====================================================================
+;; Set-up for invmix_loop?: both pointers restart at the second round
+;; key, GFPGFR is saved in B30 and loaded from B3, and A24/B24/A25/B25
+;; receive the multipliers 0x09/0x0B/0x0D/0x0E replicated in each byte.
+ SUB B0,1,B0 ; skip last round
+|| ADD B30,A0,$KPA ; skip first round
+|| ADD B30,A0,$KPB
+|| MVC GFPGFR,B30 ; save GFPGFR
+ LDW *${KPA}[0],$K[0]
+|| LDW *${KPB}[1],$K[1]
+|| MVC B3,GFPGFR
+ LDW *${KPA}[2],$K[2]
+|| LDW *${KPB}[3],$K[3]
+ MVK 0x00000909,A24
+|| MVK 0x00000B0B,B24
+ MVKH 0x09090000,A24
+|| MVKH 0x0B0B0000,B24
+ SUB B0,1,B0
+
+ GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
+|| GMPY4 $K[1],A24,$Kx9[1]
+|| MVK 0x00000D0D,A25
+|| MVK 0x00000E0E,B25
+ GMPY4 $K[2],A24,$Kx9[2]
+|| GMPY4 $K[3],A24,$Kx9[3]
+|| MVKH 0x0D0D0000,A25
+|| MVKH 0x0E0E0000,B25
+
+ GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
+|| GMPY4 $K[1],B24,$KxB[1]
+ GMPY4 $K[2],B24,$KxB[2]
+|| GMPY4 $K[3],B24,$KxB[3]
+
+;;====================================================================
+;; One round key (four words) per pass: each word is multiplied bytewise
+;; by 0x09, 0x0B, 0x0D and 0x0E with GMPY4, the products are rotated
+;; (SWAP2/ROTL) and XORed together, and the result is stored over the
+;; original round key. The 0x09 and 0x0B products for the next round
+;; key are started before the current one is stored.
+invmix_loop?:
+ GMPY4 $K[0],A25,$KxD[0] ; ·0x0D
+|| GMPY4 $K[1],A25,$KxD[1]
+|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16
+|| SWAP2 $Kx9[1],$Kx9[1]
+|| MV $K[0],$s[0] ; this or DINT
+|| MV $K[1],$s[1]
+|| [B0] LDW *${KPA}[4],$K[0]
+|| [B0] LDW *${KPB}[5],$K[1]
+ GMPY4 $K[2],A25,$KxD[2]
+|| GMPY4 $K[3],A25,$KxD[3]
+|| SWAP2 $Kx9[2],$Kx9[2]
+|| SWAP2 $Kx9[3],$Kx9[3]
+|| MV $K[2],$s[2]
+|| MV $K[3],$s[3]
+|| [B0] LDW *${KPA}[6],$K[2]
+|| [B0] LDW *${KPB}[7],$K[3]
+
+ GMPY4 $s[0],B25,$KxE[0] ; ·0x0E
+|| GMPY4 $s[1],B25,$KxE[1]
+|| XOR $Kx9[0],$KxB[0],$KxB[0]
+|| XOR $Kx9[1],$KxB[1],$KxB[1]
+ GMPY4 $s[2],B25,$KxE[2]
+|| GMPY4 $s[3],B25,$KxE[3]
+|| XOR $Kx9[2],$KxB[2],$KxB[2]
+|| XOR $Kx9[3],$KxB[3],$KxB[3]
+
+ ROTL $KxB[0],TBL3,$KxB[0]
+|| ROTL $KxB[1],TBL3,$KxB[1]
+|| SWAP2 $KxD[0],$KxD[0] ; rotate by 16
+|| SWAP2 $KxD[1],$KxD[1]
+ ROTL $KxB[2],TBL3,$KxB[2]
+|| ROTL $KxB[3],TBL3,$KxB[3]
+|| SWAP2 $KxD[2],$KxD[2]
+|| SWAP2 $KxD[3],$KxD[3]
+|| [B0] B invmix_loop? ; repeat while round keys remain
+
+ XOR $KxE[0],$KxD[0],$KxE[0]
+|| XOR $KxE[1],$KxD[1],$KxE[1]
+|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
+|| [B0] GMPY4 $K[1],A24,$Kx9[1]
+|| ADDAW $KPA,4,$KPA ; advance to the next round key
+ XOR $KxE[2],$KxD[2],$KxE[2]
+|| XOR $KxE[3],$KxD[3],$KxE[3]
+|| [B0] GMPY4 $K[2],A24,$Kx9[2]
+|| [B0] GMPY4 $K[3],A24,$Kx9[3]
+|| ADDAW $KPB,4,$KPB
+
+ XOR $KxB[0],$KxE[0],$KxE[0]
+|| XOR $KxB[1],$KxE[1],$KxE[1]
+|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
+|| [B0] GMPY4 $K[1],B24,$KxB[1]
+ XOR $KxB[2],$KxE[2],$KxE[2]
+|| XOR $KxB[3],$KxE[3],$KxE[3]
+|| [B0] GMPY4 $K[2],B24,$KxB[2]
+|| [B0] GMPY4 $K[3],B24,$KxB[3]
+|| STW $KxE[0],*${KPA}[-4] ; overwrite the round key just processed
+|| STW $KxE[1],*${KPB}[-3]
+ STW $KxE[2],*${KPA}[-2]
+|| STW $KxE[3],*${KPB}[-1]
+|| [B0] SUB B0,1,B0
+;;====================================================================
+ BNOP B31,3 ; return to the caller's address saved in B31
+ MVC B30,GFPGFR ; restore GFPGFR(*)
+ MVK 0,RET ; success
+ .endasmfunc
+___
+# (*) Even though ABI doesn't specify GFPGFR as non-volatile, there
+# are code samples out there that *assume* its default value.
+}
+{
+my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8");
+$code.=<<___;
+;; AES_ctr32_encrypt: CTR mode with a 32-bit counter.
+;; Arguments: A4 = input, B4 = output, A6 = number of 16-byte blocks,
+;; B6 = key schedule, A8 = 16-byte counter block.
+;; Each pass hands the counter block to __encrypt in A9:A8/B9:B8 (B2 is
+;; zero, so __encrypt neither loads input nor stores output), then the
+;; following pass XORs the returned keystream with the input loaded in
+;; A29:A28/B29:B28 and stores it. A1 is zero on the first pass only, so
+;; nothing is XORed or stored before the first keystream block exists.
+;; Only plain C64x instructions are used: the big-endian path moves the
+;; counter with MV rather than the C64x+ DMV.
+ .global _AES_ctr32_encrypt
+_AES_ctr32_encrypt:
+ .asmfunc
+ LDNDW *${ivp}[0],A31:A30 ; load counter value
+|| MV $blocks,A2 ; reassign $blocks
+|| MV RA,B27 ; reassign RA
+|| MV $key,B26 ; reassign $key
+ LDNDW *${ivp}[1],B31:B30
+|| MVK 0,B2 ; don't let __encrypt load input
+|| MVK 0,A1 ; and postpone writing output
+ .if .BIG_ENDIAN
+ NOP 2 ; B30 is first read in the 3rd packet of the loop
+ .else
+ NOP 4
+ SWAP2 B31,B31 ; keep least significant 32 bits
+ SWAP4 B31,B31 ; in host byte order
+ .endif
+ctr32_loop?:
+ [A2] BNOP __encrypt
+|| [A1] XOR A29,A9,A9 ; input^Ek(counter)
+|| [A1] XOR A28,A8,A8
+|| [A2] LDNDW *INP++,A29:A28 ; load input
+ [!A2] BNOP B27 ; return
+|| [A1] XOR B29,B9,B9
+|| [A1] XOR B28,B8,B8
+|| [A2] LDNDW *INP++,B29:B28
+ .if .BIG_ENDIAN
+ [A1] STNDW A9:A8,*OUT++ ; save output
+|| [A2] MV A31,A9 ; pass counter value to __encrypt
+|| [A2] MV A30,A8 ; pass counter value to __encrypt
+|| [A2] MV B30,B0 ; keep the pre-increment counter word
+|| [A2] ADD B30,1,B30 ; counter++
+ [A1] STNDW B9:B8,*OUT++
+|| [A2] MV B31,B9 ; plain MVs, DMV is C64x+ only
+|| [A2] MV B0,B8
+ .else
+ [A1] STNDW A9:A8,*OUT++ ; save output
+|| [A2] MV A31,A9
+|| [A2] MV A30,A8
+|| [A2] SWAP2 B31,B0
+|| [A2] ADD B31,1,B31 ; counter++
+ [A1] STNDW B9:B8,*OUT++
+|| [A2] MV B30,B8
+|| [A2] SWAP4 B0,B9
+ .endif
+ [A2] ADDKPC ctr32_loop?,RA ; return to ctr32_loop?
+|| [A2] MV B26,KEY ; pass $key
+|| [A2] SUB A2,1,A2 ; $blocks--
+||[!A1] MVK 1,A1
+ NOP
+ NOP
+ .endasmfunc
+___
+}
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+ .if __TI_EABI__
+ .sect ".text:aes_asm.const"
+ .else
+ .sect ".const:aes_asm"
+ .endif
+ .align 128
+AES_Te:
+ .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84
+ .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
+ .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
+ .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
+ .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
+ .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
+ .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
+ .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
+ .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
+ .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
+ .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
+ .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
+ .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
+ .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
+ .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
+ .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
+ .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
+ .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
+ .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
+ .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
+ .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
+ .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
+ .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
+ .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
+ .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
+ .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
+ .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
+ .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
+ .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
+ .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
+ .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
+ .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
+ .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
+ .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
+ .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
+ .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
+ .byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
+ .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
+ .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
+ .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
+ .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
+ .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
+ .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
+ .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
+ .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
+ .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
+ .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
+ .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
+ .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
+ .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
+ .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
+ .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
+ .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
+ .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
+ .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
+ .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
+ .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
+ .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
+ .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
+ .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
+ .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
+ .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
+ .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
+ .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
+ .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
+ .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
+ .byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
+ .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
+ .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
+ .byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
+ .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
+ .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
+ .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
+ .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
+ .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
+ .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
+ .byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
+ .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
+ .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
+ .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
+ .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
+ .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
+ .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
+ .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
+ .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
+ .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
+ .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
+ .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
+ .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
+ .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
+ .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
+ .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
+ .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
+ .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
+ .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
+ .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
+ .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
+ .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
+ .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
+ .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
+ .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
+ .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
+ .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
+ .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
+ .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
+ .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
+ .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
+ .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
+ .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
+ .byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
+ .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
+ .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
+ .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
+ .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
+ .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
+ .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
+ .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
+ .byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
+ .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
+ .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
+ .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
+ .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
+ .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
+ .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
+ .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
+ .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
+ .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
+ .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
+AES_Te4: ; 256 one-byte entries: the AES S-box itself
+ .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+rcon: ; ten 4-byte entries, only the first byte of each is nonzero; presumably the key-schedule round constants
+ .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00
+ .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+ .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+ .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+ .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
+ .align 128 ; AES_Td starts on a 128-byte boundary
+AES_Td: ; 256 entries of 4 bytes; presumably the decryption counterpart of AES_Te
+ .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53
+ .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
+ .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
+ .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
+ .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
+ .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
+ .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
+ .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
+ .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
+ .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
+ .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
+ .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
+ .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
+ .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
+ .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
+ .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
+ .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
+ .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
+ .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
+ .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
+ .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
+ .byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
+ .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
+ .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
+ .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
+ .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
+ .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
+ .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
+ .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
+ .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
+ .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
+ .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
+ .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
+ .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
+ .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
+ .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
+ .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
+ .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
+ .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
+ .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
+ .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
+ .byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
+ .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
+ .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
+ .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
+ .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
+ .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
+ .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
+ .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
+ .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
+ .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
+ .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
+ .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
+ .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
+ .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
+ .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
+ .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
+ .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
+ .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
+ .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
+ .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
+ .byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
+ .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
+ .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
+ .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
+ .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
+ .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
+ .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
+ .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
+ .byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
+ .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
+ .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
+ .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
+ .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
+ .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
+ .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
+ .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
+ .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
+ .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
+ .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
+ .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
+ .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
+ .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
+ .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
+ .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
+ .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
+ .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
+ .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
+ .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
+ .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
+ .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
+ .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
+ .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
+ .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
+ .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
+ .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
+ .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
+ .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
+ .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
+ .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
+ .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
+ .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
+ .byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
+ .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
+ .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
+ .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
+ .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
+ .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
+ .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
+ .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
+ .byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
+ .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
+ .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
+ .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
+ .byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
+ .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
+ .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
+ .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
+ .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
+ .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
+ .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
+ .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
+ .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
+ .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
+ .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
+ .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
+ .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
+ .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
+AES_Td4: ; 256 one-byte entries: the AES inverse S-box
+ .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+ .cstring "AES for C64x, CRYPTOGAMS by <appro\@openssl.org>" ; identification string placed right after AES_Td4
+ .align 4 ; pad the section end to a 4-byte boundary
+___
+
+print $code; # write out the generated assembly (STDOUT is presumably redirected earlier in the script)
+close STDOUT;
diff --git a/crypto/c64xcpuid.pl b/crypto/c64xcpuid.pl
new file mode 100644
index 0000000000..88fd153b98
--- /dev/null
+++ b/crypto/c64xcpuid.pl
@@ -0,0 +1,326 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} # consume arguments until one looks like a file name (name.ext)
+open STDOUT,">$output"; # send everything printed below to that file; NOTE(review): failure to open is not checked
+
+$code.=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__ ; assembler version below 7000000: define the EABI flag as 0
+ .endif
+ .if __TI_EABI__ ; EABI build: make each underscore-prefixed name used below stand for the plain name
+ .asg OPENSSL_rdtsc,_OPENSSL_rdtsc
+ .asg OPENSSL_cleanse,_OPENSSL_cleanse
+ .asg CRYPTO_memcmp,_CRYPTO_memcmp
+ .asg OPENSSL_atomic_add,_OPENSSL_atomic_add
+ .asg OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu
+ .asg OPENSSL_instrument_bus,_OPENSSL_instrument_bus
+ .asg OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2
+ .endif
+
+ .asg B3,RA ; B3 is the register every routine below branches to on return
+ .asg 0x01AC0000,TIMER_BASE ; Timer 2
+
+ .global _OPENSSL_rdtsc
+_OPENSSL_rdtsc: ; reads the timer count word into A4 and starts the timer if its control word reads zero
+ .asmfunc
+ MVKL TIMER_BASE,A5 ; A5 = timer register base, low half
+ MVKH TIMER_BASE,A5 ; A5 = timer register base, high half
+ LDW *A5[0],A2 ; load CTL
+ LDW *A5[2],A4 ; load CTN
+ NOP 2 ; NOTE(review): presumably pads out the load delay slots before A2 is tested
+ .if .BIG_ENDIAN ; pair halves are swapped per byte order; presumably so the control word is stored at word 0 either way
+ MVK 0x2c0,A7 ; internal clock source, don't hold, go
+|| MVK -1,A6 ; maximum period
+ .else
+ MVK 0x2c0,A6 ; internal clock source, don't hold, go
+|| MVK -1,A7 ; maximum period
+ .endif
+ [!A2] STDW A7:A6,*A5[0] ; fire it up
+|| BNOP RA,5 ; return
+ .endasmfunc
+
+ .global _OPENSSL_cleanse
+_OPENSSL_cleanse: ; stores zero over the buffer at A4; B4 is its length in bytes
+ .asmfunc
+ ZERO A3:A2 ; the zero double word written by the loop
+|| ZERO B2
+|| SHRU B4,3,B0 ; is length >= 8
+|| ADD 1,A4,B6 ; B6 = A4+1, a second pointer so two byte stores can be paired
+ [!B0] BNOP RA ; fewer than 8 bytes: return; the next five packets still run before the branch takes effect
+|| [B0] SUB B0,1,B2 ; B2 = number of 8-byte stores, less one
+|| ZERO A1
+|| ZERO B1
+ [B2] BDEC cleanse_loop?,B2
+||[!B0] CMPLT 0,B4,A1 ; short input only: A1 = (0 < length), i.e. byte 0 exists
+||[!B0] CMPLT 1,B4,B1 ; short input only: B1 = (1 < length), i.e. byte 1 exists
+|| ZERO B5 ; zero source for the stores on the B side
+ [A1] STB A2,*A4++[2]
+|| [B1] STB B5,*B6++[2]
+|| [B2] BDEC cleanse_loop?,B2
+||[!B0] CMPLT 2,B4,A1
+||[!B0] CMPLT 3,B4,B1
+ [A1] STB A2,*A4++[2]
+|| [B1] STB B5,*B6++[2]
+|| [B2] BDEC cleanse_loop?,B2
+||[!B0] CMPLT 4,B4,A1
+||[!B0] CMPLT 5,B4,B1
+ [A1] STB A2,*A4++[2]
+|| [B1] STB B5,*B6++[2]
+|| [B2] BDEC cleanse_loop?,B2
+||[!B0] CMPLT 6,B4,A1
+ [A1] STB A2,*A4++[2]
+|| [B2] BDEC cleanse_loop?,B2
+
+cleanse_loop?: ; each pass clears 8 bytes
+ STNDW A3:A2,*A4++ ; non-aligned double-word store of zero
+|| SUB B4,8,B4
+|| [B2] BDEC cleanse_loop?,B2
+
+ MV B4,B0 ; remaining bytes
+|| ADD 1,A4,B6
+|| BNOP RA ; return; the five tail packets below still run before the branch takes effect
+ [B0] CMPLT 0,B0,A1
+|| [B0] CMPLT 1,B0,B1
+ [A1] STB A2,*A4++[2]
+|| [B1] STB B5,*B6++[2]
+|| [B0] CMPLT 2,B0,A1
+|| [B0] CMPLT 3,B0,B1
+ [A1] STB A2,*A4++[2]
+|| [B1] STB B5,*B6++[2]
+|| [B0] CMPLT 4,B0,A1
+|| [B0] CMPLT 5,B0,B1
+ [A1] STB A2,*A4++[2]
+|| [B1] STB B5,*B6++[2]
+|| [B0] CMPLT 6,B0,A1
+ [A1] STB A2,*A4++[2]
+ .endasmfunc
+
+ .if 0 ; this routine is assembled out
+ .global _CRYPTO_memcmp
+_CRYPTO_memcmp: ; compares A6 bytes at A4 and at B4; A4 = 0 if all are equal, 1 otherwise
+ .asmfunc
+ MV A6,B0 ; B0 = byte count
+ [!B0] BNOP RA ; zero length: return 0
+||[!B0] ZERO A4
+|| [B0] ZERO A1:A0 ; A0 accumulates the byte differences
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ XOR A5,B5,A1 ; A1 = difference of the byte pair that has arrived
+|| [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+
+memcmp_loop?:
+ OR A1,A0,A0 ; fold the previous difference into the accumulator
+|| XOR A5,B5,A1
+|| [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+
+ BNOP RA,3 ; return
+ ZERO A4
+ [A0] MVK 1,A4 ; any difference seen: the result is 1
+ .endasmfunc
+ .endif
+
+ .global _OPENSSL_atomic_add
+_OPENSSL_atomic_add: ; intended effect: add B4 to the word at A4, store the sum back and return it in A4
+ .asmfunc
+ BNOP atomic_store? ; pre-C64x+ systems are uni-processor, it's
+|| LDW *A4,B5 ; enough to hold interrupts off through
+ ; the load-update-store cycle to achieve
+ ; atomicity
+ NOP
+ BNOP RA,3 ; and this branch stretches even over store
+ ADD B4,B5,B5 ; NOTE(review): confirm by cycle count that this issues before the branch to the store takes effect
+atomic_store?:
+ STW B5,*A4
+|| MV B5,A4 ; return value
+ .endasmfunc
+
+ .global _OPENSSL_wipe_cpu
+_OPENSSL_wipe_cpu: ; zeroes registers 0-9 and 16-31 of both files, except B3 which holds the return address
+ .asmfunc
+ ZERO A0
+|| ZERO B0
+|| ZERO A1
+|| ZERO B1
+ ZERO A3:A2
+|| MVD B0,B2 ; B2 = B0, which was zeroed in the previous packet
+|| ZERO A4
+|| ZERO B4
+|| ZERO A5
+|| ZERO B5
+|| BNOP RA ; return; the five packets that follow fill the branch delay slots
+ ZERO A7:A6
+|| ZERO B7:B6
+|| ZERO A8
+|| ZERO B8
+|| ZERO A9
+|| ZERO B9
+ ZERO A17:A16
+|| ZERO B17:B16
+|| ZERO A18
+|| ZERO B18
+|| ZERO A19
+|| ZERO B19
+ ZERO A21:A20
+|| ZERO B21:B20
+|| ZERO A22
+|| ZERO B22
+|| ZERO A23
+|| ZERO B23
+ ZERO A25:A24
+|| ZERO B25:B24
+|| ZERO A26
+|| ZERO B26
+|| ZERO A27
+|| ZERO B27
+ ZERO A29:A28
+|| ZERO B29:B28
+|| ZERO A30
+|| ZERO B30
+|| ZERO A31
+|| ZERO B31
+ .endasmfunc
+
+CLFLUSH .macro CONTROL,ADDR,LEN
+ B passthrough?
+|| STW ADDR,*CONTROL[0] ; second macro operand goes to the word the first operand points at
+ STW LEN,*CONTROL[1] ; third macro operand goes to the following word
+spinlock?:
+ LDW *CONTROL[1],A0 ; read that word back (clobbers A0)
+ NOP 3
+passthrough?:
+ NOP
+ [A0] BNOP spinlock?,5 ; keep polling until it reads zero
+ .endm
+
+ .global _OPENSSL_instrument_bus
+_OPENSSL_instrument_bus: ; in: A4 = output, B4 = sizeof(output); the latter is returned in A4
+ .asmfunc
+ MV B4,B0 ; reassign sizeof(output)
+|| MV A4,B4 ; reassign output
+|| MVK 0x00004030,A3 ; low half of the L1DWIBAR address
+|| MVKL TIMER_BASE,B16
+ MV B0,A4 ; return value
+|| MVK 1,A1 ; length operand handed to the flush macro
+|| MVKH 0x01840000,A3 ; L1DWIBAR
+|| MVKH TIMER_BASE,B16
+ LDW *B16[2],B8 ; collect 1st tick
+|| MVK 0x00004010,A5 ; low half of the L2WIBAR address
+ NOP 4
+ MV B8,B9 ; lasttick = tick
+|| MVK 0,B7 ; lastdiff = 0
+|| MVKH 0x01840000,A5 ; L2WIBAR
+ CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
+ CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
+ LDW *B4,B5 ; output word += lastdiff (load, add, store)
+ NOP 4
+ ADD B7,B5,B5
+ STW B5,*B4
+bus_loop1?: ; repeats while B0 is nonzero
+ LDW *B16[2],B8 ; read the timer count again
+|| [B0] SUB B0,1,B0
+ NOP 4
+ SUB B8,B9,B7 ; lastdiff = tick - lasttick
+|| MV B8,B9 ; lasttick = tick
+ CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
+ CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
+ LDW *B4,B5
+ NOP 4
+ ADD B7,B5,B5
+ STW B5,*B4 ; [!B1] is removed to flatten samples
+|| ADDK 4,B4
+|| [B0] BNOP bus_loop1?,5
+
+ BNOP RA,5 ; return
+ .endasmfunc
+
+ .global _OPENSSL_instrument_bus2
+_OPENSSL_instrument_bus2: ; in: A4 = output, B4 = sizeof(output), A6 = max; A4 returns the count built up below
+ .asmfunc
+ MV A6,B0 ; reassign max
+|| MV B4,A6 ; reassign sizeof(output)
+|| MVK 0x00004030,A3 ; low half of the L1DWIBAR address
+|| MVKL TIMER_BASE,B16
+ MV A4,B4 ; reassign output
+|| MVK 0,A4 ; return value
+|| MVK 1,A1 ; length operand handed to the flush macro
+|| MVKH 0x01840000,A3 ; L1DWIBAR
+|| MVKH TIMER_BASE,B16
+
+ LDW *B16[2],B8 ; collect 1st tick
+|| MVK 0x00004010,A5 ; low half of the L2WIBAR address
+ NOP 4
+ MV B8,B9 ; lasttick = tick
+|| MVK 0,B7 ; lastdiff = 0
+|| MVKH 0x01840000,A5 ; L2WIBAR
+ CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
+ CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
+ LDW *B4,B5 ; output word += lastdiff (load, add, store)
+ NOP 4
+ ADD B7,B5,B5
+ STW B5,*B4
+
+ LDW *B16[2],B8 ; collect 1st diff
+ NOP 4
+ SUB B8,B9,B7 ; lastdiff = tick - lasttick
+|| MV B8,B9 ; lasttick = tick
+|| SUB B0,1,B0
+bus_loop2?:
+ CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
+ CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
+ LDW *B4,B5
+ NOP 4
+ ADD B7,B5,B5
+ STW B5,*B4 ; [!B1] is removed to flatten samples
+||[!B0] BNOP bus_loop2_done?,2 ; leave once the max counter has run down to zero
+|| SUB B0,1,B0
+ LDW *B16[2],B8 ; read the timer count again
+ NOP 4
+ SUB B8,B9,B8 ; B8 = tick - lasttick
+|| MV B8,B9 ; lasttick = tick
+ CMPEQ B8,B7,B2 ; B2 = (new diff equals lastdiff)
+|| MV B8,B7 ; lastdiff = new diff
+ [!B2] ADDAW B4,1,B4 ; diff changed: move on to the next output word
+||[!B2] ADDK 1,A4 ; and count it in the return value
+ CMPEQ A4,A6,A2 ; A2 = (count has reached the value passed as sizeof(output))
+ [!A2] BNOP bus_loop2?,5
+
+bus_loop2_done?:
+ BNOP RA,5 ; return
+ .endasmfunc
+
+ .if __TI_EABI__ ; the entry below goes to a differently named section depending on the ABI
+ .sect ".init_array"
+ .else
+ .sect ".pinit"
+ .endif
+ .align 4
+ .long _OPENSSL_rdtsc ; auto-start timer
+___
+
+print $code; # write out the generated assembly (STDOUT was redirected to the output file at the top)
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-c64x-large.pl b/crypto/sha/asm/sha1-c64x-large.pl
new file mode 100644
index 0000000000..3916ff3a3f
--- /dev/null
+++ b/crypto/sha/asm/sha1-c64x-large.pl
@@ -0,0 +1,230 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x.
+#
+# November 2016
+#
+# This is a fully-unrolled SHA1 implementation. It's 25% faster than
+# one with compact loops, doesn't use an in-memory ring buffer, as
+# everything is accommodated in registers, and has "perfect" interrupt
+# agility. The drawback is obviously the code size...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} # consume arguments until one looks like a file name (name.ext)
+open STDOUT,">$output"; # send everything printed below to that file; NOTE(review): failure to open is not checked
+
+($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$K) = map("A$_",(16..20, 21..24)); # A16-A20 carry the state A..E, A21-A24 the temporaries and the round constant
+@V = ($A,$B,$C,$D,$E); # register roles; rotated by one after every generated round
+@X = map("B$_",(16..31)); # B16-B31 hold the sixteen live message schedule words
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
+
+sub BODY_00_19 { # append one of rounds 0-19 to $code
+my ($i,$a,$b,$c,$d,$e) = @_; # round number, then the registers currently playing A..E
+my $j = ($i+1)&15; # slot of the schedule word prepared for the next round (used from round 15 on)
+
+$code.=<<___ if ($i<14); # rounds 0-13: also load input word i+2 and byte-swap word i+1
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| ANDN $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+|| LDNW *${INP}++,@X[$i+2]
+ OR $F0,$F,$F ; F_00_19(B,C,D)
+|| ROTL $b,30,$b
+|| SWAP2 @X[$i+1],@X[$i+1]
+|| ADD @X[$i],$e,$e ; E+=X[i]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| SWAP4 @X[$i+1],@X[$i+1]
+ ADD $F,$e,$e ; E+=F_00_19(B,C,D)
+___
+$code.=<<___ if ($i==14); # round 14: nothing left to load, only the byte swap of word 15 remains
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| ANDN $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+ OR $F0,$F,$F ; F_00_19(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i],$e,$e ; E+=X[i]
+|| SWAP2 @X[$i+1],@X[$i+1]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| SWAP4 @X[$i+1],@X[$i+1]
+ ADD $F,$e,$e ; E+=F_00_19(B,C,D)
+___
+$code.=<<___ if ($i==15); # round 15: first schedule update; the leading || joins the previous round's last packet
+|| XOR @X[($j+2)&15],@X[$j],@X[$j]
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| ANDN $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+|| XOR @X[($j+8)&15],@X[$j],@X[$j]
+ OR $F0,$F,$F ; F_00_19(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i],$e,$e ; E+=X[i]
+|| XOR @X[($j+13)&15],@X[$j],@X[$j]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| ROTL @X[$j],1,@X[$j]
+ ADD $F,$e,$e ; E+=F_00_19(B,C,D)
+___
+$code.=<<___ if ($i>15); # rounds 16-19: as round 15, with the X[i] slot number reduced mod 16
+|| XOR @X[($j+2)&15],@X[$j],@X[$j]
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| ANDN $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+|| XOR @X[($j+8)&15],@X[$j],@X[$j]
+ OR $F0,$F,$F ; F_00_19(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i&15],$e,$e ; E+=X[i]
+|| XOR @X[($j+13)&15],@X[$j],@X[$j]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| ROTL @X[$j],1,@X[$j]
+ ADD $F,$e,$e ; E+=F_00_19(B,C,D)
+___
+}
+
+sub BODY_20_39 { # append one round with F = B xor C xor D; the driver also calls it for rounds 60-79
+my ($i,$a,$b,$c,$d,$e) = @_; # round number, then the registers currently playing A..E
+my $j = ($i+1)&15; # slot of the schedule word prepared for the next round
+
+$code.=<<___ if ($i<79); # every such round except the very last one
+|| XOR @X[($j+2)&15],@X[$j],@X[$j]
+ ROTL $a,5,$Arot ;; $i
+|| XOR $c,$b,$F
+|| ADD $K,$e,$e ; E+=K
+|| XOR @X[($j+8)&15],@X[$j],@X[$j]
+ XOR $d,$F,$F ; F_20_39(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i&15],$e,$e ; E+=X[i]
+|| XOR @X[($j+13)&15],@X[$j],@X[$j]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| ROTL @X[$j],1,@X[$j]
+ ADD $F,$e,$e ; E+=F_20_39(B,C,D)
+___
+$code.=<<___ if ($i==79); # final round: no schedule update; prefetch the next input, add the saved context, loop while A0 is nonzero
+|| [A0] B loop?
+|| [A0] LDNW *${INP}++,@X[0] ; pre-fetch input
+ ROTL $a,5,$Arot ;; $i
+|| XOR $c,$b,$F
+|| ADD $K,$e,$e ; E+=K
+|| [A0] LDNW *${INP}++,@X[1]
+ XOR $d,$F,$F ; F_20_39(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i&15],$e,$e ; E+=X[i]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+ ADD $F,$e,$e ; E+=F_20_39(B,C,D)
+|| ADD $Bctx,$a,$a ; accumulate context
+|| ADD $Cctx,$b,$b
+ ADD $Dctx,$c,$c
+|| ADD $Ectx,$d,$d
+|| ADD $Actx,$e,$e
+;;===== branch to loop? is taken here
+___
+}
+
+sub BODY_40_59 { # append one round with F = (B and C) xor (B and D) xor (C and D)
+my ($i,$a,$b,$c,$d,$e) = @_; # round number, then the registers currently playing A..E
+my $j = ($i+1)&15; # slot of the schedule word prepared for the next round
+
+$code.=<<___;
+|| XOR @X[($j+2)&15],@X[$j],@X[$j]
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| AND $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+|| XOR @X[($j+8)&15],@X[$j],@X[$j]
+ XOR $F0,$F,$F ; (B and C) xor (B and D)
+|| AND $c,$d,$F0 ; C and D
+|| ROTL $b,30,$b
+|| XOR @X[($j+13)&15],@X[$j],@X[$j]
+|| ADD @X[$i&15],$e,$e ; E+=X[i]
+ XOR $F0,$F,$F ; F_40_59(B,C,D)
+|| ADD $Arot,$e,$e ; E+=rot(A,5)
+|| ROTL @X[$j],1,@X[$j]
+ ADD $F,$e,$e ; E+=F_40_59(B,C,D)
+___
+}
+
+$code=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__ ; assembler version below 7000000: define the EABI flag as 0
+ .endif
+ .if __TI_EABI__ ; EABI build: make the underscore-prefixed name used below stand for the plain name
+ .asg sha1_block_data_order,_sha1_block_data_order
+ .endif
+
+ .asg B3,RA ; B3 is the register branched to on return
+ .asg A15,FP
+ .asg B15,SP
+
+ .if .BIG_ENDIAN ; big-endian build: both byte-swap steps become plain moves
+ .asg MV,SWAP2
+ .asg MV,SWAP4
+ .endif
+
+ .global _sha1_block_data_order
+_sha1_block_data_order: ; in: A4 = context, B4 = input, A6 = number of blocks
+ .asmfunc
+ MV $NUM,A0 ; reassign $NUM
+ [!A0] BNOP RA ; if ($NUM==0) return;
+|| [A0] LDW *${CTX}[0],$A ; load A-E...
+ [A0] LDW *${CTX}[1],$B
+ [A0] LDW *${CTX}[2],$C
+ [A0] LDW *${CTX}[3],$D
+ [A0] LDW *${CTX}[4],$E
+ [A0] LDNW *${INP}++,@X[0] ; pre-fetch input
+ [A0] LDNW *${INP}++,@X[1]
+ NOP 3
+
+loop?: ; one pass per 16-word input block; A0 counts the passes left
+ SUB A0,1,A0
+|| MV $A,$Actx ; keep the incoming A-E for the accumulate step of the last round
+|| MVD $B,$Bctx
+|| SWAP2 @X[0],@X[0]
+|| MVKL 0x5a827999,$K
+ MVKH 0x5a827999,$K ; K_00_19
+|| MV $C,$Cctx
+|| MV $D,$Dctx
+|| MVD $E,$Ectx
+|| SWAP4 @X[0],@X[0]
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } # rounds 0-19; rotating @V hands each round the registers in their new roles
+$code.=<<___;
+|| MVKL 0x6ed9eba1,$K
+ MVKH 0x6ed9eba1,$K ; K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } # rounds 20-39
+$code.=<<___;
+|| MVKL 0x8f1bbcdc,$K
+ MVKH 0x8f1bbcdc,$K ; K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } # rounds 40-59
+$code.=<<___;
+|| MVKL 0xca62c1d6,$K
+ MVKH 0xca62c1d6,$K ; K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } # rounds 60-79 reuse the xor-round generator
+$code.=<<___;
+ BNOP RA ; return
+ STW $A,*${CTX}[0] ; emit A-E...
+ STW $B,*${CTX}[1]
+ STW $C,*${CTX}[2]
+ STW $D,*${CTX}[3]
+ STW $E,*${CTX}[4]
+ .endasmfunc
+
+ .sect .const
+ .cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code; # write out the generated assembly (STDOUT was redirected to the output file at the top)
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-c64x.pl b/crypto/sha/asm/sha1-c64x.pl
new file mode 100644
index 0000000000..d7a9dd1d05
--- /dev/null
+++ b/crypto/sha/asm/sha1-c64x.pl
@@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x.
+#
+# November 2016
+#
+# If compared to compiler-generated code with similar characteristics,
+# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
+# this implementation is 25% smaller and >2x faster. In absolute terms
+# performance is (quite impressive) ~6.5 cycles per processed byte.
+# Unlike its predecessor, the sha1-c64xplus module, this module has worse
+# interrupt agility. While the original added up to 5 cycles of delay to
+# the interrupt response, this module adds up to 100. A fully unrolled
+# implementation doesn't add any delay and is even 25% faster, but is
+# almost 5x larger...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and, for their own
+# well-being, zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} # consume arguments until one looks like a file name (name.ext)
+open STDOUT,">$output"; # everything printed below goes to that file
+# Register allocation: each variable below holds the name of a C64x register.
+($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
+# A16-A20 hold the working variables A-E; A21-A25 hold ROL(A,5), the F temporaries, T and the round constant K.
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
+($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27"); # the four schedule words that are XORed together in Xupdate
+($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31)); # temporaries for input byte swapping and Xupdate
+($XPA,$XPB) = ("A5","B5"); # X circular buffer
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # saved context; $Bctx is A6, so it zaps $NUM (which is copied to A0 first)
+
+$code=<<___;
+ .text
+;; For assembler versions below 7000000 define __TI_EABI__ as 0; under EABI map the underscore-prefixed name to the plain one.
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg sha1_block_data_order,_sha1_block_data_order
+ .endif
+;; Symbolic names for the return-address, frame-pointer and stack-pointer registers.
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+;; On big-endian targets SWAP2/SWAP4 expand to MV, so input words are used without byte swapping.
+ .if .BIG_ENDIAN
+ .asg MV,SWAP2
+ .asg MV,SWAP4
+ .endif
+;; Code for the block function follows.
+ .global _sha1_block_data_order
+_sha1_block_data_order:
+ .asmfunc stack_usage(64)
+ MV $NUM,A0 ; reassign $NUM
+|| MVK -64,B0 ; mask used below to align SP
+ [!A0] BNOP RA ; if ($NUM==0) return;
+|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
+|| [A0] MV SP,FP ; FP = SP as it was on entry (issued in the same packet as the decrement)
+ [A0] LDW *${CTX}[0],$A ; load A-E...
+|| [A0] AND B0,SP,SP ; align stack at 64 bytes
+ [A0] LDW *${CTX}[1],$B
+|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ [A0] LDW *${CTX}[2],$C
+|| [A0] MVK 0x00404,B0
+ [A0] LDW *${CTX}[3],$D
+|| [A0] MVKH 0x50000,B0 ; AMR value 0x050404: 64-byte circular block for ${XPA}/${XPB}
+ [A0] LDW *${CTX}[4],$E
+|| [A0] MVC B0,AMR ; setup circular addressing
+ LDNW *${INP}++,$TX1 ; pre-fetch input
+ NOP 1
+;; The five [A0]-guarded packets above fill the delay slots of the early-return branch.
+loop?: ; executed once per input block
+ MVKL 0x5a827999,$K
+|| ADDAW SP,2,$XPB ; ring-buffer pointer = SP + two words (SP was lowered by two words in the prologue)
+|| SUB A0,1,A0 ; one block fewer to go
+ MVKH 0x5a827999,$K ; K_00_19
+|| MV $A,$Actx ; keep the incoming A-E for the accumulation after round 79
+|| MV $B,$Bctx
+;;==================================================
+ B body_00_13? ; BODY_00_13
+|| MVK 11,B0 ; BDEC counter; with the explicit B above the body runs 14 times (rounds 0-13)
+|| MV $XPB,$XPA ; both ring-buffer pointers start at the same slot
+|| MV $C,$Cctx
+|| MV $D,$Dctx
+|| MVD $E,$Ectx
+;; One round per pass, five execute packets each: fetch an input word, byte-swap it, add it in and append it to the ring buffer.
+body_00_13?:
+ ROTL $A,5,$Arot
+|| AND $C,$B,$F
+|| ANDN $D,$B,$F0
+|| ADD $K,$E,$T ; T=E+K
+
+ XOR $F0,$F,$F ; F_00_19(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+|| SWAP2 $TX1,$TX2 ; first half of the byte swap of the word fetched earlier
+|| LDNW *${INP}++,$TX1 ; start fetching the next input word
+
+ ADD $F,$T,$T ; T+=F_00_19(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| SWAP4 $TX2,$TX3 ; byte swap
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+
+ ADD $TX3,$T,$A ; A=T+Xi
+|| STW $TX3,*${XPB}++ ; append Xi to the ring buffer
+|| BDEC body_00_13?,B0 ; delayed branch: takes effect at the end of the following pass
+;;================================================== round 14: as above, plus the first schedule loads
+ ROTL $A,5,$Arot ; BODY_14
+|| AND $C,$B,$F
+|| ANDN $D,$B,$F0
+|| ADD $K,$E,$T ; T=E+K
+
+ XOR $F0,$F,$F ; F_00_19(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+|| SWAP2 $TX1,$TX2
+|| LDNW *${INP}++,$TX1 ; last input fetch for this block (BODY_15 issues none)
+
+ ADD $F,$T,$T ; T+=F_00_19(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| SWAP4 $TX2,$TX2 ; byte swap
+|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are
+|| LDW *${XPB}[4],$X2 ; 2 iterations ahead
+;; The word for the X13 operand is copied from the previous round's value (TX3) rather than reloaded, see the MV below.
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3 ; remember this round's word for the next round's X13
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++
+;;================================================== round 15: last round fed directly from the input
+ ROTL $A,5,$Arot ; BODY_15
+|| AND $C,$B,$F
+|| ANDN $D,$B,$F0
+|| ADD $K,$E,$T ; T=E+K
+
+ XOR $F0,$F,$F ; F_00_19(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+|| SWAP2 $TX1,$TX2
+
+ ADD $F,$T,$T ; T+=F_00_19(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| SWAP4 $TX2,$TX2 ; byte swap
+|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead
+|| LDW *${XPA}++,$X0
+|| LDW *${XPB}[4],$X2
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| XOR $X8,$X13,$TX1 ; second XOR pair of Xupdate
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++
+|| XOR $TX0,$TX1,$TX1 ; combine both pairs; the rotate by one happens in the next round
+;;==================================================
+|| B body_16_19? ; BODY_16_19
+|| MVK 1,B0 ; BDEC counter; with the explicit B the body runs 4 times (rounds 16-19)
+;; The two lines above belong to the last execute packet of round 15.
+body_16_19?: ; F_00_19 rounds whose Xi comes from Xupdate instead of the input
+ ROTL $A,5,$Arot
+|| AND $C,$B,$F
+|| ANDN $D,$B,$F0
+|| ADD $K,$E,$T ; T=E+K
+|| ROTL $TX1,1,$TX2 ; Xupdate output
+
+ XOR $F0,$F,$F ; F_00_19(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+
+ ADD $F,$T,$T ; T+=F_00_19(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| XOR $X0,$X2,$TX0 ; first XOR pair for the next round's Xupdate
+|| LDW *${XPA}++,$X0
+|| LDW *${XPB}[4],$X2
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| XOR $X8,$X13,$TX1
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++ ; write the new schedule word back to the ring buffer
+|| XOR $TX0,$TX1,$TX1
+|| BDEC body_16_19?,B0
+;; Constant and loop counter for the next group of rounds; the generated loop's branch joins the MVKH packet.
+ MVKL 0x6ed9eba1,$K
+|| MVK 17,B0
+ MVKH 0x6ed9eba1,$K ; K_20_39
+___
+sub BODY_20_39 { # append one F_20_39 (B^C^D) round loop to $code; emitted text starts with a "||" line, so it joins the preceding packet
+my $label = shift; # loop label, which lets the same body be emitted twice (see the later call with "body_60_78?")
+$code.=<<___;
+;;==================================================
+|| B $label ; BODY_20_39
+
+$label:
+ ROTL $A,5,$Arot
+|| XOR $B,$C,$F
+|| ADD $K,$E,$T ; T=E+K
+|| ROTL $TX1,1,$TX2 ; Xupdate output
+
+ XOR $D,$F,$F ; F_20_39(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+
+ ADD $F,$T,$T ; T+=F_20_39(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| XOR $X0,$X2,$TX0
+|| LDW *${XPA}++,$X0
+|| LDW *${XPB}[4],$X2
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| XOR $X8,$X13,$TX1
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++ ; last one is redundant
+|| XOR $TX0,$TX1,$TX1
+|| BDEC $label,B0
+___
+} &BODY_20_39("body_20_39?"); # first use: rounds 20-39, B0 counter and K_20_39 were set just before
+$code.=<<___;
+;;================================================== rounds 40-59: F is built from B&C, B&D and C&D
+ MVKL 0x8f1bbcdc,$K
+|| MVK 17,B0 ; BDEC counter; with the explicit B below the body runs 20 times
+ MVKH 0x8f1bbcdc,$K ; K_40_59
+|| B body_40_59? ; BODY_40_59
+|| AND $B,$C,$F ; first two AND terms are prepared one packet ahead of the round
+|| AND $B,$D,$F0
+
+body_40_59?:
+ ROTL $A,5,$Arot
+|| XOR $F0,$F,$F ; (B&C)^(B&D)
+|| AND $C,$D,$F0 ; C&D
+|| ADD $K,$E,$T ; T=E+K
+|| ROTL $TX1,1,$TX2 ; Xupdate output
+
+ XOR $F0,$F,$F ; F_40_59(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+
+ ADD $F,$T,$T ; T+=F_40_59(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| XOR $X0,$X2,$TX0
+|| LDW *${XPA}++,$X0
+|| LDW *${XPB}[4],$X2
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| XOR $X8,$X13,$TX1
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++
+|| XOR $TX0,$TX1,$TX1
+|| AND $B,$C,$F ; AND terms for the next pass, using the freshly rotated B, C, D
+|| AND $B,$D,$F0
+|| BDEC body_40_59?,B0
+;; Constant and loop counter for rounds 60-78; round 79 is written out separately after the loop.
+ MVKL 0xca62c1d6,$K
+|| MVK 16,B0
+ MVKH 0xca62c1d6,$K ; K_60_79
+___
+ &BODY_20_39("body_60_78?"); # BODY_60_78
+$code.=<<___;
+;;================================================== round 79, merged with the context accumulation and the loop-back branch
+ [A0] B loop?
+|| ROTL $A,5,$Arot ; BODY_79
+|| XOR $B,$C,$F
+|| ROTL $TX1,1,$TX2 ; Xupdate output
+
+ [A0] LDNW *${INP}++,$TX1 ; pre-fetch input
+|| ADD $K,$E,$T ; T=E+K
+|| XOR $D,$F,$F ; F_20_39(B,C,D)
+
+ ADD $F,$T,$T ; T+=F_20_39(B,C,D)
+|| ADD $Ectx,$D,$E ; E=D,E+=Ectx
+|| ADD $Dctx,$C,$D ; D=C,D+=Dctx
+|| ROTL $B,30,$C ; C=ROL(B,30)
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| ADD $Bctx,$A,$B ; B=A,B+=Bctx
+
+ ADD $TX2,$T,$A ; A=T+Xi
+
+ ADD $Actx,$A,$A ; A+=Actx
+|| ADD $Cctx,$C,$C ; C+=Cctx
+;; end of loop?
+;; Reached only when A0 is zero: return, with the stack/AMR clean-up and the five stores placed after the branch.
+ BNOP RA ; return
+|| MV FP,SP ; restore stack pointer
+|| LDW *FP[0],FP ; restore frame pointer
+ STW $A,*${CTX}[0] ; emit A-E...
+|| MVK 0,B0
+ STW $B,*${CTX}[1]
+|| MVC B0,AMR ; clear AMR
+ STW $C,*${CTX}[2]
+ STW $D,*${CTX}[3]
+ STW $E,*${CTX}[4]
+ .endasmfunc
+;; Identification string, placed in the .const section.
+ .sect .const
+ .cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha256-c64x.pl b/crypto/sha/asm/sha256-c64x.pl
new file mode 100644
index 0000000000..fbe99c0b7f
--- /dev/null
+++ b/crypto/sha/asm/sha256-c64x.pl
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256 for C64x.
+#
+# November 2016
+#
+# Performance is just below 10 cycles per processed byte, which is
+# almost 40% faster than compiler-generated code. Unroll is unlikely
+# to give more than ~8% improvement...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and, for their own
+# well-being, zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} # consume arguments until one looks like a file name (name.ext)
+open STDOUT,">$output"; # everything printed below goes to that file
+# Register allocation: each variable below holds the name of a C64x register.
+($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
+ $K256="A3"; # pointer into the K256 constant table
+# a-d (with their saved copies) live in A16-A23, e-h in B16-B23; A24-A31/B24-B31 hold temporaries and schedule words.
+($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
+ =map("A$_",(16..31));
+($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
+ =map("B$_",(16..31));
+
+($Xia,$Xib)=("A5","B5"); # circular/ring buffer
+ $CTXB=$t2e; # B-side copy of the context pointer; shares its register with the $t2e temporary
+
+($Xn,$X0,$K)=("B7","B8","B9"); # next input/schedule word, current X[i], current K256[i]
+($Maj,$Ch)=($T2,"B6"); # Maj is built directly in $T2's register
+
+$code.=<<___;
+ .text
+;; For assembler versions below 7000000 define __TI_EABI__ as 0; under EABI map the underscore-prefixed name to the plain one.
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg sha256_block_data_order,_sha256_block_data_order
+ .endif
+;; Symbolic names for the return-address, frame-pointer and stack-pointer registers.
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+;; .asg takes the replacement text first and the symbol second: on big-endian targets SWAP2/SWAP4 must expand to MV (no byte swap).
+ .if .BIG_ENDIAN
+ .asg MV,SWAP2
+ .asg MV,SWAP4
+ .endif
+
+ .global _sha256_block_data_order
+_sha256_block_data_order:
+__sha256_block:
+ .asmfunc stack_usage(64)
+ MV $NUM,A0 ; reassign $NUM
+|| MVK -64,B0 ; mask used below to align SP
+ [!A0] BNOP RA ; if ($NUM==0) return;
+|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
+|| [A0] MV SP,FP ; FP = SP as it was on entry (issued in the same packet as the decrement)
+ [A0] ADDKPC _sha256_block_data_order,B2 ; run-time address of the entry label, for position-independent access to K256
+|| [A0] AND B0,SP,SP ; align stack at 64 bytes
+ .if __TI_EABI__
+ [A0] MVK 0x00404,B1
+|| [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256
+ [A0] MVKH 0x50000,B1
+|| [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256
+ .else
+ [A0] MVK 0x00404,B1
+|| [A0] MVKL (K256-__sha256_block),$K256
+ [A0] MVKH 0x50000,B1
+|| [A0] MVKH (K256-__sha256_block),$K256
+ .endif
+ [A0] MVC B1,AMR ; setup circular addressing
+|| [A0] MV SP,$Xia
+ [A0] MV SP,$Xib ; ring buffer starts at the aligned SP (read before the SUBAW in this packet takes effect)
+|| [A0] ADD B2,$K256,$K256 ; table address = entry address + offset computed above
+|| [A0] MV $CTXA,$CTXB
+|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ LDW *${CTXA}[0],$A ; load ctx
+|| LDW *${CTXB}[4],$E
+ LDW *${CTXA}[1],$B
+|| LDW *${CTXB}[5],$F
+ LDW *${CTXA}[2],$C
+|| LDW *${CTXB}[6],$G
+ LDW *${CTXA}[3],$D
+|| LDW *${CTXB}[7],$H
+;; Prime the pipeline: first input word and first round constant.
+ LDNW *$INP++,$Xn ; pre-fetch input
+ LDW *$K256++,$K ; pre-fetch K256[0]
+ NOP
+ ADDAW $Xia,9,$Xia ; A-side ring pointer is kept nine words ahead (used for the X[i+9] fetches)
+outerloop?: ; executed once per input block
+ SUB A0,1,A0
+|| MV $A,$Actx ; keep the incoming a-h for the accumulation after round 63
+|| MV $E,$Ectx
+|| MVD $B,$Bctx
+|| MVD $F,$Fctx
+ MV $C,$Cctx
+|| MV $G,$Gctx
+|| MVD $D,$Dctx
+|| MVD $H,$Hctx
+|| SWAP4 $Xn,$X0 ; SWAP4 here and SWAP2 below byte-swap the pre-fetched input word
+
+ MVK 14,B0 ; loop counter
+|| SWAP2 $X0,$X0
+;; Rounds 0-14: eight execute packets per round; the [B0] BDEC is taken 14 times, giving 15 passes.
+loop_00_14?: ; BODY_00_14
+ LDNW *$INP++,$Xn
+|| ROTL $A,30,$S0
+|| OR $A,$B,$Maj
+|| AND $A,$B,$t2a
+|| ROTL $E,26,$S1
+|| AND $F,$E,$Ch
+|| ANDN $G,$E,$t2e
+ ROTL $A,19,$t0a
+|| AND $C,$Maj,$Maj
+|| ROTL $E,21,$t0e
+|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
+ ROTL $A,10,$t1a
+|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+|| ROTL $E,7,$t1e
+|| ADD $K,$H,$T1 ; T1 = h + K256[i]
+|| [B0] BDEC loop_00_14?,B0
+ ADD $X0,$T1,$T1 ; T1 += X[i];
+|| STW $X0,*$Xib++ ; append X[i] to the ring buffer
+|| XOR $t0a,$S0,$S0
+|| XOR $t0e,$S1,$S1
+ XOR $t1a,$S0,$S0 ; Sigma0(a)
+|| XOR $t1e,$S1,$S1 ; Sigma1(e)
+|| LDW *$K256++,$K ; pre-fetch K256[i+1]
+|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
+ ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
+|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
+|| ROTL $G,0,$H ; h = g
+|| MV $F,$G ; g = f
+|| MV $X0,$X14 ; after the last pass this copy is X[14]
+|| SWAP4 $Xn,$X0 ; byte-swap the next input word (completed by SWAP2 below)
+ SWAP2 $X0,$X0
+|| MV $E,$F ; f = e
+|| ADD $D,$T1,$E ; e = d + T1
+|| MV $C,$D ; d = c
+ MV $B,$C ; c = b
+|| MV $A,$B ; b = a
+|| ADD $T1,$T2,$A ; a = T1 + T2
+;;===== branch to loop_00_14? is taken here
+
+ ROTL $A,30,$S0 ; BODY_15
+|| OR $A,$B,$Maj
+|| AND $A,$B,$t2a
+|| ROTL $E,26,$S1
+|| AND $F,$E,$Ch
+|| ANDN $G,$E,$t2e
+|| LDW *${Xib}[1],$Xn ; modulo-scheduled
+ ROTL $A,19,$t0a
+|| AND $C,$Maj,$Maj
+|| ROTL $E,21,$t0e
+|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
+|| LDW *${Xib}[2],$X1 ; modulo-scheduled
+ ROTL $A,10,$t1a
+|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+|| ROTL $E,7,$t1e
+|| ADD $K,$H,$T1 ; T1 = h + K256[i]
+ ADD $X0,$T1,$T1 ; T1 += X[i];
+|| STW $X0,*$Xib++
+|| XOR $t0a,$S0,$S0
+|| XOR $t0e,$S1,$S1
+ XOR $t1a,$S0,$S0 ; Sigma0(a)
+|| XOR $t1e,$S1,$S1 ; Sigma1(e)
+|| LDW *$K256++,$K ; pre-fetch K256[i+1]
+|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
+ ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
+|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
+|| ROTL $G,0,$H ; h = g
+|| MV $F,$G ; g = f
+|| MV $X0,$X15 ; keep X[15] in a register for the schedule
+ MV $E,$F ; f = e
+|| ADD $D,$T1,$E ; e = d + T1
+|| MV $C,$D ; d = c
+|| MV $Xn,$X0 ; modulo-scheduled
+|| LDW *$Xia,$X9 ; modulo-scheduled
+|| ROTL $X1,25,$t0e ; modulo-scheduled
+|| ROTL $X14,15,$t0a ; modulo-scheduled
+ SHRU $X1,3,$s0 ; modulo-scheduled
+|| SHRU $X14,10,$s1 ; modulo-scheduled
+|| ROTL $B,0,$C ; c = b
+|| MV $A,$B ; b = a
+|| ADD $T1,$T2,$A ; a = T1 + T2
+;; The "modulo-scheduled" operations above start the message schedule for round 16 while round 15 is still finishing.
+ MVK 47,B1 ; loop counter
+|| ROTL $X1,14,$t1e ; modulo-scheduled
+|| ROTL $X14,13,$t1a ; modulo-scheduled
+
+loop_16_63?: ; BODY_16_63
+ XOR $t0e,$s0,$s0
+|| XOR $t0a,$s1,$s1
+|| MV $X15,$X14 ; shift the register-held window: X[i+15] becomes X[i+14]
+|| MV $X1,$Xn ; the current X[i+1] is the next round's X[i]
+ XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
+|| XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
+|| LDW *${Xib}[2],$X1 ; modulo-scheduled
+ ROTL $A,30,$S0
+|| OR $A,$B,$Maj
+|| AND $A,$B,$t2a
+|| ROTL $E,26,$S1
+|| AND $F,$E,$Ch
+|| ANDN $G,$E,$t2e
+|| ADD $X9,$X0,$X0 ; X[i] += X[i+9]
+ ROTL $A,19,$t0a
+|| AND $C,$Maj,$Maj
+|| ROTL $E,21,$t0e
+|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
+|| ADD $s0,$X0,$X0 ; X[i] += sigma0(X[i+1])
+ ROTL $A,10,$t1a
+|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+|| ROTL $E,7,$t1e
+|| ADD $H,$K,$T1 ; T1 = h + K256[i]
+|| ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
+|| [B1] BDEC loop_16_63?,B1
+ XOR $t0a,$S0,$S0
+|| XOR $t0e,$S1,$S1
+|| ADD $X0,$T1,$T1 ; T1 += X[i]
+|| STW $X0,*$Xib++ ; write the new schedule word back to the ring buffer
+ XOR $t1a,$S0,$S0 ; Sigma0(a)
+|| XOR $t1e,$S1,$S1 ; Sigma1(e)
+|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
+|| MV $X0,$X15 ; the word just produced is the newest window entry
+|| ROTL $G,0,$H ; h = g
+|| LDW *$K256++,$K ; pre-fetch K256[i+1]
+ ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
+|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
+|| MV $F,$G ; g = f
+|| MV $Xn,$X0 ; modulo-scheduled
+|| LDW *++$Xia,$X9 ; modulo-scheduled
+|| ROTL $X1,25,$t0e ; modulo-scheduled
+|| ROTL $X14,15,$t0a ; modulo-scheduled
+ ROTL $X1,14,$t1e ; modulo-scheduled
+|| ROTL $X14,13,$t1a ; modulo-scheduled
+|| MV $E,$F ; f = e
+|| ADD $D,$T1,$E ; e = d + T1
+|| MV $C,$D ; d = c
+|| MV $B,$C ; c = b
+ MV $A,$B ; b = a
+|| ADD $T1,$T2,$A ; a = T1 + T2
+|| SHRU $X1,3,$s0 ; modulo-scheduled
+|| SHRU $X14,10,$s1 ; modulo-scheduled
+;;===== branch to loop_16_63? is taken here
+
+ [A0] B outerloop? ; more blocks to process
+|| [A0] LDNW *$INP++,$Xn ; pre-fetch input
+|| [A0] ADDK -260,$K256 ; rewind K256 (65 words were fetched, counting the pre-fetches)
+|| ADD $Actx,$A,$A ; accumulate ctx
+|| ADD $Ectx,$E,$E
+|| ADD $Bctx,$B,$B
+ ADD $Fctx,$F,$F
+|| ADD $Cctx,$C,$C
+|| ADD $Gctx,$G,$G
+|| ADD $Dctx,$D,$D
+|| ADD $Hctx,$H,$H
+|| [A0] LDW *$K256++,$K ; pre-fetch K256[0]
+;; The [!A0] packets below execute only after the last block; otherwise they are idle delay slots of the branch above.
+ [!A0] BNOP RA
+||[!A0] MV $CTXA,$CTXB ; re-create the B-side context pointer (its register doubles as a temporary)
+ [!A0] MV FP,SP ; restore stack pointer
+||[!A0] LDW *FP[0],FP ; restore frame pointer
+ [!A0] STW $A,*${CTXA}[0] ; save ctx
+||[!A0] STW $E,*${CTXB}[4]
+||[!A0] MVK 0,B0
+ [!A0] STW $B,*${CTXA}[1]
+||[!A0] STW $F,*${CTXB}[5]
+||[!A0] MVC B0,AMR ; clear AMR
+ STW $C,*${CTXA}[2] ; unguarded: not reached when the branch to outerloop? is taken
+|| STW $G,*${CTXB}[6]
+ STW $D,*${CTXA}[3]
+|| STW $H,*${CTXB}[7]
+ .endasmfunc
+
+ .if __TI_EABI__
+ .sect ".text:sha_asm.const"
+ .else
+ .sect ".const:sha_asm"
+ .endif
+ .align 128
+K256: ; round-constant table read through the K256 pointer: 64 words, 16 rows of four
+ .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ .cstring "SHA256 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+;; End of generated text. The table is reached PC-relatively from the function entry, hence the .text subsection under EABI.
+___
+print $code; # emit the accumulated assembly text to the file opened as STDOUT
+close STDOUT; # close the output explicitly, as the sibling C64x generator scripts do
diff --git a/crypto/sha/asm/sha512-c64x.pl b/crypto/sha/asm/sha512-c64x.pl
new file mode 100644
index 0000000000..e35a72ade5
--- /dev/null
+++ b/crypto/sha/asm/sha512-c64x.pl
@@ -0,0 +1,437 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA512 for C64x.
+#
+# November 2016
+#
+# Performance is ~19 cycles per processed byte. Compared to block
+# transform function from sha512.c compiled with cl6x with -mv6400+
+# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
+# Loop unroll won't make it, this implementation, any faster, because
+# it's effectively dominated by SHRU||SHL pairs and you can't schedule
+# more of them.
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and, for their own
+# well-being, zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} # consume arguments until one looks like a file name (name.ext)
+open STDOUT,">$output"; # everything printed below goes to that file
+# Register allocation: each variable below holds the name of a C64x register.
+($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
+ $K512="A3"; # pointer into the K512 constant table
+# Each 64-bit state word is split: high halves in A16-A31, low halves in B16-B31, interleaved with their saved (ctx) copies.
+($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
+ $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
+($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
+ $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
+# Temporaries; A10-A13 and B10-B13 are among them, and the generated prologue/epilogue saves and restores those registers.
+($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
+($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
+($T1hi, $T2hi)= ("A6","A7"); # $T1hi reuses $NUM's register (A6); $NUM is copied to A0 first
+($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9"); # low halves plus the carry registers written by ADDU
+($Khi,$Klo)=("A9","A8"); # register pair that receives each K512 entry
+($MAJhi,$MAJlo)=($T2hi,$T2lo); # Maj is built directly in T2's registers
+($t1hi,$t1lo)=($Khi,"B2"); # $t1hi shares a register with $Khi
+ $CTXB=$t1lo; # B-side copy of the context pointer; shares its register with $t1lo
+
+($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer
+
+$code.=<<___;
+ .text
+;; For assembler versions below 7000000 define __TI_EABI__ as 0; under EABI map the underscore-prefixed name to the plain one.
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg sha512_block_data_order,_sha512_block_data_order
+ .endif
+;; Symbolic names for the return-address, frame-pointer and stack-pointer registers.
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+;; KHI/KLO name the registers holding the high/low half of the current K512 entry; the loaded pair trades roles with endianness.
+ .if .BIG_ENDIAN
+ .asg $Khi,KHI
+ .asg $Klo,KLO
+ .else
+ .asg $Khi,KLO
+ .asg $Klo,KHI
+ .endif
+
+ .global _sha512_block_data_order
+_sha512_block_data_order:
+__sha512_block:
+ .asmfunc stack_usage(40+128)
+ MV $NUM,A0 ; reassign $NUM
+|| MVK -128,B0 ; used both as the allocation size (ADD) and as the alignment mask (AND)
+ [!A0] BNOP RA ; if ($NUM==0) return;
+|| [A0] STW FP,*SP--(40) ; save frame pointer
+|| [A0] MV SP,FP ; FP = SP as it was on entry (issued in the same packet as the decrement)
+ [A0] STDW B13:B12,*SP[4] ; save the register pairs that the round code uses as temporaries
+|| [A0] MVK 0x00404,B1
+ [A0] STDW B11:B10,*SP[3]
+|| [A0] STDW A13:A12,*FP[-3]
+|| [A0] MVKH 0x60000,B1 ; AMR value 0x060404
+ [A0] STDW A11:A10,*SP[1]
+|| [A0] MVC B1,AMR ; setup circular addressing
+|| [A0] ADD B0,SP,SP ; alloca(128)
+ .if __TI_EABI__
+ [A0] AND B0,SP,SP ; align stack at 128 bytes
+|| [A0] ADDKPC __sha512_block,B1
+|| [A0] MVKL \$PCR_OFFSET(K512,__sha512_block),$K512
+ [A0] MVKH \$PCR_OFFSET(K512,__sha512_block),$K512
+|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ .else
+ [A0] AND B0,SP,SP ; align stack at 128 bytes
+|| [A0] ADDKPC __sha512_block,B1
+|| [A0] MVKL (K512-__sha512_block),$K512
+ [A0] MVKH (K512-__sha512_block),$K512
+|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ .endif
+ ADDAW SP,3,$Xilo ; low-half ring pointer: one word above the high-half pointer
+ ADD SP,4*2,$Xihi ; ADDAW SP,2,$Xihi
+;; The MV below is part of the same execute packet as the ADD above.
+|| MV $CTXA,$CTXB
+ LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx
+|| LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo ; the XOR with .LITTLE_ENDIAN swaps the hi/lo word index on little-endian targets
+|| ADD B1,$K512,$K512 ; table address = entry address + offset computed above
+ LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi
+|| LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo
+ LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi
+|| LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo
+ LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi
+|| LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo
+ LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi
+|| LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo
+ LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi
+|| LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo
+ LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi
+|| LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo
+ LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi
+|| LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo
+;; Prime the pipeline: first 64-bit input word and first round constant.
+ LDNDW *$INP++,B11:B10 ; pre-fetch input
+ LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
+outerloop?: ; executed once per input block
+ MVK 15,B0 ; loop counters
+|| MVK 64,B1 ; B1 counts the rounds that take X[i] from the message schedule
+|| SUB A0,1,A0
+ MV $Ahi,$Actxhi ; keep the incoming a-h for the accumulation at break?
+|| MV $Alo,$Actxlo
+|| MV $Bhi,$Bctxhi
+|| MV $Blo,$Bctxlo
+|| MV $Chi,$Cctxhi
+|| MV $Clo,$Cctxlo
+|| MVD $Dhi,$Dctxhi
+|| MVD $Dlo,$Dctxlo
+ MV $Ehi,$Ectxhi
+|| MV $Elo,$Ectxlo
+|| MV $Fhi,$Fctxhi
+|| MV $Flo,$Fctxlo
+|| MV $Ghi,$Gctxhi
+|| MV $Glo,$Gctxlo
+|| MVD $Hhi,$Hctxhi
+|| MVD $Hlo,$Hctxlo
+loop0_15?: ; rounds fed from the input: convert the fetched 64-bit word to hi/lo halves
+ .if .BIG_ENDIAN
+ MV B11,$T1hi
+|| MV B10,$T1lo
+ .else
+ SWAP4 B10,$T1hi
+|| SWAP4 B11,$T1lo
+ SWAP2 $T1hi,$T1hi ; SWAP4 followed by SWAP2 completes the byte swap
+|| SWAP2 $T1lo,$T1lo
+ .endif
+ STW $T1hi,*$Xihi++[2] ; original loop16_79?
+|| STW $T1lo,*$Xilo++[2] ; X[i] = T1
+|| ADD $Hhi,$T1hi,$T1hi
+|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h
+|| SHRU $Ehi,14,$S1hi ; first shift pair of the 64-bit rotate by 14 used in Sigma1(e)
+|| SHL $Ehi,32-14,$S1lo
+loop16_79?: ; round body shared by all 80 rounds; 64-bit rotates are assembled from SHRU/SHL pairs on the 32-bit halves
+ XOR $Fhi,$Ghi,$CHhi
+|| XOR $Flo,$Glo,$CHlo
+|| ADD KHI,$T1hi,$T1hi
+|| ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i]
+|| SHRU $Elo,14,$t0lo
+|| SHL $Elo,32-14,$t0hi
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| AND $Ehi,$CHhi,$CHhi
+|| AND $Elo,$CHlo,$CHlo
+|| ROTL $Ghi,0,$Hhi
+|| ROTL $Glo,0,$Hlo ; h = g
+|| SHRU $Ehi,18,$t0hi
+|| SHL $Ehi,32-18,$t0lo
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| XOR $Ghi,$CHhi,$CHhi
+|| XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g
+|| ROTL $Fhi,0,$Ghi
+|| ROTL $Flo,0,$Glo ; g = f
+|| SHRU $Elo,18,$t0lo
+|| SHL $Elo,32-18,$t0hi
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| OR $Ahi,$Bhi,$MAJhi
+|| OR $Alo,$Blo,$MAJlo
+|| ROTL $Ehi,0,$Fhi
+|| ROTL $Elo,0,$Flo ; f = e
+|| SHRU $Ehi,41-32,$t0lo ; rotate by 41 (more than 32): the halves swap sides
+|| SHL $Ehi,64-41,$t0hi
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| AND $Chi,$MAJhi,$MAJhi
+|| AND $Clo,$MAJlo,$MAJlo
+|| ROTL $Dhi,0,$Ehi
+|| ROTL $Dlo,0,$Elo ; e = d
+|| SHRU $Elo,41-32,$t0hi
+|| SHL $Elo,64-41,$t0lo
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo ; Sigma1(e)
+|| AND $Ahi,$Bhi,$t1hi
+|| AND $Alo,$Blo,$t1lo
+|| ROTL $Chi,0,$Dhi
+|| ROTL $Clo,0,$Dlo ; d = c
+|| SHRU $Ahi,28,$S0hi
+|| SHL $Ahi,32-28,$S0lo
+ OR $t1hi,$MAJhi,$MAJhi
+|| OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+|| ADD $CHhi,$T1hi,$T1hi
+|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g)
+|| ROTL $Bhi,0,$Chi
+|| ROTL $Blo,0,$Clo ; c = b
+|| SHRU $Alo,28,$t0lo
+|| SHL $Alo,32-28,$t0hi
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $S1hi,$T1hi,$T1hi
+|| ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e)
+|| ROTL $Ahi,0,$Bhi
+|| ROTL $Alo,0,$Blo ; b = a
+|| SHRU $Ahi,34-32,$t0lo
+|| SHL $Ahi,64-34,$t0hi
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $MAJhi,$T1hi,$T2hi
+|| ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c)
+|| SHRU $Alo,34-32,$t0hi
+|| SHL $Alo,64-34,$t0lo
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $Ehi,$T1hi,$T1hi
+|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e
+|| SHRU $Ahi,39-32,$t0lo
+|| SHL $Ahi,64-39,$t0hi
+ [B0] BNOP loop0_15? ; while input rounds remain, go back and consume the next input word
+|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| SHRU $Alo,39-32,$t0hi
+|| SHL $Alo,64-39,$t0lo
+||[!B0] LDW *${Xihi}[28],$T1hi
+||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14]
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a)
+|| ADD $T1carry,$T1hi,$Ehi ; fold the carry of the low half into the high half
+|| ROTL $T1lo,0,$Elo ; e = T1, "ghost" value
+||[!B1] BNOP break? ; all rounds done: leave for the context accumulation
+ ADD $S0hi,$T2hi,$T2hi
+|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a)
+|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i]
+ NOP ; avoid cross-path stall
+ ADD $T2carry,$T2hi,$Ahi
+|| MV $T2lo,$Alo ; a = T2
+|| [B0] SUB B0,1,B0
+;;===== branch to loop0_15? is taken here
+ [B1] LDW *${Xihi}[2],$T2hi ; message schedule for the next round starts here; T1 holds X[i+14]
+|| [B1] LDW *${Xilo}[2],$T2lo ; X[i+1]
+|| [B1] SHRU $T1hi,19,$S1hi
+|| [B1] SHL $T1hi,32-19,$S1lo
+ [B1] SHRU $T1lo,19,$t0lo
+|| [B1] SHL $T1lo,32-19,$t0hi
+;;===== branch to break? is taken here
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| SHRU $T1hi,61-32,$t0lo
+|| SHL $T1hi,64-61,$t0hi
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| SHRU $T1lo,61-32,$t0hi
+|| SHL $T1lo,64-61,$t0lo
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| SHRU $T1hi,6,$t0hi ; plain 64-bit shift by 6: nothing is shifted into the high half
+|| SHL $T1hi,32-6,$t0lo
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| SHRU $T1lo,6,$t0lo
+|| LDW *${Xihi}[18],$T1hi
+|| LDW *${Xilo}[18],$T1lo ; X[i+9]
+ XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14])
+;; Same execute packet continues: fetch X[i] and start sigma0 of X[i+1], which is held in T2.
+|| LDW *${Xihi}[0],$CHhi
+|| LDW *${Xilo}[0],$CHlo ; X[i]
+|| SHRU $T2hi,1,$S0hi
+|| SHL $T2hi,32-1,$S0lo
+ SHRU $T2lo,1,$t0lo
+|| SHL $T2lo,32-1,$t0hi
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| SHRU $T2hi,8,$t0hi
+|| SHL $T2hi,32-8,$t0lo
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| SHRU $T2lo,8,$t0lo
+|| SHL $T2lo,32-8,$t0hi
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $S1hi,$T1hi,$T1hi
+|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1()
+|| SHRU $T2hi,7,$t0hi ; plain 64-bit shift by 7
+|| SHL $T2hi,32-7,$t0lo
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $CHhi,$T1hi,$T1hi
+|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i]
+|| SHRU $T2lo,7,$t0lo
+|| [B1] BNOP loop16_79?
+ XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1])
+;; Last additions of the schedule; the carry from the low half is folded into the high half before the store.
+ ADD $S0hi,$T1hi,$T1hi
+|| ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0()
+|| [B1] SUB B1,1,B1
+ NOP ; avoid cross-path stall
+ ADD $T1carry,$T1hi,$T1hi
+;; The packet below repeats the one placed just above the loop16_79? label, which the branch back skips.
+ STW $T1hi,*$Xihi++[2] ; copied "top" bundle
+|| STW $T1lo,*$Xilo++[2] ; X[i] = T1
+|| ADD $Hhi,$T1hi,$T1hi
+|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h
+|| SHRU $Ehi,14,$S1hi
+|| SHL $Ehi,32-14,$S1lo
+;;===== branch to loop16_79? is taken here
+
+break?: ; add the saved context to a-h; the *ctxlo registers are reused to receive the low-half carries
+ ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx
+|| ADDU $Alo,$Actxlo,$Actxlo:$Alo
+|| [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input
+|| [A0] ADDK -640,$K512 ; rewind pointer to K512
+ ADD $Bhi,$Bctxhi,$Bhi
+|| ADDU $Blo,$Bctxlo,$Bctxlo:$Blo
+|| [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
+ ADD $Chi,$Cctxhi,$Chi
+|| ADDU $Clo,$Cctxlo,$Cctxlo:$Clo
+|| ADD $Actxlo,$Ahi,$Ahi ; add the carry produced two packets earlier
+||[!A0] MV $CTXA,$CTXB ; re-create the B-side context pointer (its register doubles as a temporary)
+ ADD $Dhi,$Dctxhi,$Dhi
+|| ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo
+|| ADD $Bctxlo,$Bhi,$Bhi
+||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx
+||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN]
+ ADD $Ehi,$Ectxhi,$Ehi
+|| ADDU $Elo,$Ectxlo,$Ectxlo:$Elo
+|| ADD $Cctxlo,$Chi,$Chi
+|| [A0] BNOP outerloop? ; more blocks to process
+||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
+||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN]
+ ADD $Fhi,$Fctxhi,$Fhi
+|| ADDU $Flo,$Fctxlo,$Fctxlo:$Flo
+|| ADD $Dctxlo,$Dhi,$Dhi
+||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN]
+||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN]
+ ADD $Ghi,$Gctxhi,$Ghi
+|| ADDU $Glo,$Gctxlo,$Gctxlo:$Glo
+|| ADD $Ectxlo,$Ehi,$Ehi
+||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
+||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
+ ADD $Hhi,$Hctxhi,$Hhi
+|| ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo
+|| ADD $Fctxlo,$Fhi,$Fhi
+||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
+||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN]
+ ADD $Gctxlo,$Ghi,$Ghi
+||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
+||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN]
+ ADD $Hctxlo,$Hhi,$Hhi
+||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
+||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN]
+;;===== branch to outerloop? is taken here
+
+ STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN] ; reached only after the last block: store h, then unwind the frame
+|| STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
+|| MVK -40,B0
+ ADD FP,B0,SP ; destroy circular buffer
+|| LDDW *FP[-4],A11:A10 ; reload the register pairs saved in the prologue
+ LDDW *SP[2],A13:A12
+|| LDDW *FP[-2],B11:B10
+ LDDW *SP[4],B13:B12
+|| BNOP RA
+ LDW *++SP(40),FP ; restore frame pointer
+ MVK 0,B0
+ MVC B0,AMR ; clear AMR
+ NOP 2 ; wait till FP is committed
+ .endasmfunc
+
+ .if __TI_EABI__
+ .sect ".text:sha_asm.const"
+ .else
+ .sect ".const:sha_asm"
+ .endif
+ .align 128
+K512: ; round-constant table read through the K512 pointer: 80 entries of two words each, high word first
+ .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+ .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+ .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+ .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+ .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+ .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+ .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+ .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+ .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+ .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+ .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+ .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+ .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+ .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+ .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+ .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+ .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+ .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+ .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+ .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+ .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+ .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+ .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+ .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+ .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+ .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+ .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+ .uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+ .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+ .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+ .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+ .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+ .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+ .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+ .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+ .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+ .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+ .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+ .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+ .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+ .cstring "SHA512 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
+close STDOUT;