author    | fisher.yu <fisher.yu@arm.com>  | 2023-10-17 08:10:34 +0000
committer | Tomas Mraz <tomas@openssl.org> | 2023-11-29 18:10:31 +0100
commit    | cc82b09cbde0b809d37c23cb1ef9f1f41fc7f959 (patch)
tree      | ea95cee05b1404d1b47dfb4fa0e571ec7096ff10
parent    | c8fe4b5948486e792016208f7c8ccea9c380f354 (diff)
download  | openssl-cc82b09cbde0b809d37c23cb1ef9f1f41fc7f959.tar.gz
Optimize AES-CTR for ARM Neoverse V1 and V2.
Unroll the AES-CTR loops to a maximum of 12 blocks for ARM Neoverse V1 and
V2, to fully utilize their AES pipeline resources.
Improvement on ARM Neoverse V1:

Package size (bytes)        16      32      64     128     256    1024
Improvement (%)           3.93   -0.45   11.30    4.31   12.48   37.66

Package size (bytes)      1500    8192   16384   61440   65536
Improvement (%)          37.16   38.90   39.89   40.55   40.41
Change-Id: Ifb8fad9af22476259b9ba75132bc3d8010a7fdbd
Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/22733)
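
For readers skimming the patch, the runtime dispatch works as follows: OPENSSL_cpuid_setup() in crypto/armcap.c sets a new ARMV8_UNROLL12_EOR3 capability bit when the MIDR identifies a Neoverse V1 or V2 and SHA-3 (EOR3) is available, and the providers then install aes_v8_ctr32_encrypt_blocks_unroll12_eor3 as the ctr128_f stream function in place of the generic Armv8 kernel. The snippet below is a minimal sketch of that selection, not part of the patch: the choose_ctr32() helper is hypothetical, while the flag value and the function prototypes are taken from the diff (crypto/arm_arch.h and include/crypto/aes_platform.h).

```c
#include <stddef.h>

/* Capability word filled in by OPENSSL_cpuid_setup() (crypto/armcap.c). */
extern unsigned int OPENSSL_armcap_P;
/* Bit added by this commit in crypto/arm_arch.h. */
#define ARMV8_UNROLL12_EOR3 (1 << 16)

/* Both kernels share the ctr32 prototype from include/crypto/aes_platform.h;
 * "blocks" is the number of 16-byte blocks, as for ctr128_f. */
typedef void (*ctr32_fn)(const unsigned char *in, unsigned char *out,
                         size_t blocks, const void *key,
                         const unsigned char ivec[16]);

extern void aes_v8_ctr32_encrypt_blocks(const unsigned char *in,
                                        unsigned char *out, size_t blocks,
                                        const void *key,
                                        const unsigned char ivec[16]);
extern void aes_v8_ctr32_encrypt_blocks_unroll12_eor3(const unsigned char *in,
                                                      unsigned char *out,
                                                      size_t blocks,
                                                      const void *key,
                                                      const unsigned char ivec[16]);

/* Hypothetical helper: prefer the 12-block EOR3 kernel where the capability
 * bit is set (Neoverse V1/V2 with SHA-3), otherwise use the generic kernel. */
static ctr32_fn choose_ctr32(void)
{
    if (OPENSSL_armcap_P & ARMV8_UNROLL12_EOR3)
        return aes_v8_ctr32_encrypt_blocks_unroll12_eor3;
    return aes_v8_ctr32_encrypt_blocks;
}
```

In the patch itself this choice is made directly in cipher_aes_hw_armv8.inc (dat->stream.ctr) and cipher_aes_gcm_hw_armv8.inc (GCM_HW_SET_KEY_CTR_FN) rather than through a standalone helper.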
-rwxr-xr-x | crypto/aes/asm/aesv8-armx.pl                                   | 758
-rw-r--r-- | crypto/arm_arch.h                                              |   1
-rw-r--r-- | crypto/armcap.c                                                |   4
-rw-r--r-- | include/crypto/aes_platform.h                                  |  11
-rw-r--r-- | providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc  |  11
-rw-r--r-- | providers/implementations/ciphers/cipher_aes_hw.c              |   2
-rw-r--r-- | providers/implementations/ciphers/cipher_aes_hw_armv8.inc      |  34
7 files changed, 815 insertions, 6 deletions
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl index dd2c7038e4..699ecfcd21 100755 --- a/crypto/aes/asm/aesv8-armx.pl +++ b/crypto/aes/asm/aesv8-armx.pl @@ -1751,6 +1751,747 @@ $code.=<<___; .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ___ }}} + +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); +my ($rounds,$roundsx,$cnt,$key_)=("w5","x5","w6","x7"); +my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); +my ($tctr3,$tctr4,$tctr5,$tctr6)=map("w$_",(11,13..15)); +my ($tctr7,$tctr8,$tctr9,$tctr10,$tctr11)=map("w$_",(19..23)); + +# q0-q7 => v0-v7; q8-q23 => v16-v31; q24-q31 => v8-v15 +my ($ivec,$rndlast,$rndping,$rndpang)=map("q$_",(0..3)); +my ($in0,$in1,$in2,$in3,$in4,$in5)=map("q$_",(4..9)); +my ($in6,$in7,$in8,$in9,$in10,$in11)=map("q$_",(10..15)); +my ($dat0,$dat1,$dat2,$dat3,$dat4,$dat5)=map("q$_",(16..21)); +my ($dat6,$dat7,$dat8,$dat9,$dat10,$dat11)=map("q$_",(22..27)); +my ($tmp0,$tmp1,$tmp2)=map("q$_",(25..27)); + +#q_X => qX, for ldp & stp +my ($in0q,$in1q,$in2q,$in3q)=map("q_$_",(4..7)); +my ($in4q,$in5q,$in6q,$in7q,$in8q,$in9q,$in10q,$in11q)=map("q_$_",(16..23)); + +my ($dat8d,$dat9d,$dat10d,$dat11d)=map("d$_",(8..11)); + +$code.=<<___ if ($flavour =~ /64/); +.globl ${prefix}_ctr32_encrypt_blocks_unroll12_eor3 +.type ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,%function +.align 5 +${prefix}_ctr32_encrypt_blocks_unroll12_eor3: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldr $rounds,[$key,#240] + + ldr $ctr, [$ivp, #12] +#ifdef __AARCH64EB__ + vld1.8 {$dat0},[$ivp] +#else + vld1.32 {$dat0},[$ivp] +#endif + vld1.32 {$rndping-$rndpang},[$key] // load key schedule... + sub $rounds,$rounds,#4 + cmp $len,#2 + add $key_,$key,$roundsx,lsl#4 // pointer to last round key + sub $rounds,$rounds,#2 + add $key_, $key_, #64 + vld1.32 {$rndlast},[$key_] + add $key_,$key,#32 + mov $cnt,$rounds +#ifndef __AARCH64EB__ + rev $ctr, $ctr +#endif + + vorr $dat1,$dat0,$dat0 + add $tctr1, $ctr, #1 + vorr $dat2,$dat0,$dat0 + add $ctr, $ctr, #2 + vorr $ivec,$dat0,$dat0 + rev $tctr1, $tctr1 + vmov.32 ${dat1}[3],$tctr1 + b.ls .Lctr32_tail_unroll + cmp $len,#6 + rev $tctr2, $ctr + sub $len,$len,#3 // bias + vmov.32 ${dat2}[3],$tctr2 + b.lo .Loop3x_ctr32_unroll + cmp $len,#9 + vorr $dat3,$dat0,$dat0 + add $tctr3, $ctr, #1 + vorr $dat4,$dat0,$dat0 + add $tctr4, $ctr, #2 + rev $tctr3, $tctr3 + vorr $dat5,$dat0,$dat0 + add $ctr, $ctr, #3 + rev $tctr4, $tctr4 + vmov.32 ${dat3}[3],$tctr3 + rev $tctr5, $ctr + vmov.32 ${dat4}[3],$tctr4 + vmov.32 ${dat5}[3],$tctr5 + sub $len,$len,#3 + b.lo .Loop6x_ctr32_unroll + + // push regs to stack when 12 data chunks are interleaved + stp x19,x20,[sp,#-16]! + stp x21,x22,[sp,#-16]! + stp x23,x24,[sp,#-16]! + stp $dat8d,$dat9d,[sp,#-32]! + stp $dat10d,$dat11d,[sp,#-32]! 
+ + add $tctr6,$ctr,#1 + add $tctr7,$ctr,#2 + add $tctr8,$ctr,#3 + add $tctr9,$ctr,#4 + add $tctr10,$ctr,#5 + add $ctr,$ctr,#6 + vorr $dat6,$dat0,$dat0 + rev $tctr6,$tctr6 + vorr $dat7,$dat0,$dat0 + rev $tctr7,$tctr7 + vorr $dat8,$dat0,$dat0 + rev $tctr8,$tctr8 + vorr $dat9,$dat0,$dat0 + rev $tctr9,$tctr9 + vorr $dat10,$dat0,$dat0 + rev $tctr10,$tctr10 + vorr $dat11,$dat0,$dat0 + rev $tctr11,$ctr + + sub $len,$len,#6 // bias + vmov.32 ${dat6}[3],$tctr6 + vmov.32 ${dat7}[3],$tctr7 + vmov.32 ${dat8}[3],$tctr8 + vmov.32 ${dat9}[3],$tctr9 + vmov.32 ${dat10}[3],$tctr10 + vmov.32 ${dat11}[3],$tctr11 + b .Loop12x_ctr32_unroll + +.align 4 +.Loop12x_ctr32_unroll: + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + aese $dat2,$rndping + aesmc $dat2,$dat2 + aese $dat3,$rndping + aesmc $dat3,$dat3 + aese $dat4,$rndping + aesmc $dat4,$dat4 + aese $dat5,$rndping + aesmc $dat5,$dat5 + aese $dat6,$rndping + aesmc $dat6,$dat6 + aese $dat7,$rndping + aesmc $dat7,$dat7 + aese $dat8,$rndping + aesmc $dat8,$dat8 + aese $dat9,$rndping + aesmc $dat9,$dat9 + aese $dat10,$rndping + aesmc $dat10,$dat10 + aese $dat11,$rndping + aesmc $dat11,$dat11 + vld1.32 {$rndping},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + aese $dat2,$rndpang + aesmc $dat2,$dat2 + aese $dat3,$rndpang + aesmc $dat3,$dat3 + aese $dat4,$rndpang + aesmc $dat4,$dat4 + aese $dat5,$rndpang + aesmc $dat5,$dat5 + aese $dat6,$rndpang + aesmc $dat6,$dat6 + aese $dat7,$rndpang + aesmc $dat7,$dat7 + aese $dat8,$rndpang + aesmc $dat8,$dat8 + aese $dat9,$rndpang + aesmc $dat9,$dat9 + aese $dat10,$rndpang + aesmc $dat10,$dat10 + aese $dat11,$rndpang + aesmc $dat11,$dat11 + vld1.32 {$rndpang},[$key_],#16 + b.gt .Loop12x_ctr32_unroll + + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + aese $dat2,$rndping + aesmc $dat2,$dat2 + aese $dat3,$rndping + aesmc $dat3,$dat3 + aese $dat4,$rndping + aesmc $dat4,$dat4 + aese $dat5,$rndping + aesmc $dat5,$dat5 + aese $dat6,$rndping + aesmc $dat6,$dat6 + aese $dat7,$rndping + aesmc $dat7,$dat7 + aese $dat8,$rndping + aesmc $dat8,$dat8 + aese $dat9,$rndping + aesmc $dat9,$dat9 + aese $dat10,$rndping + aesmc $dat10,$dat10 + aese $dat11,$rndping + aesmc $dat11,$dat11 + vld1.32 {$rndping},[$key_],#16 + + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + aese $dat2,$rndpang + aesmc $dat2,$dat2 + aese $dat3,$rndpang + aesmc $dat3,$dat3 + aese $dat4,$rndpang + aesmc $dat4,$dat4 + aese $dat5,$rndpang + aesmc $dat5,$dat5 + aese $dat6,$rndpang + aesmc $dat6,$dat6 + aese $dat7,$rndpang + aesmc $dat7,$dat7 + aese $dat8,$rndpang + aesmc $dat8,$dat8 + aese $dat9,$rndpang + aesmc $dat9,$dat9 + aese $dat10,$rndpang + aesmc $dat10,$dat10 + aese $dat11,$rndpang + aesmc $dat11,$dat11 + vld1.32 {$rndpang},[$key_],#16 + + aese $dat0,$rndping + aesmc $dat0,$dat0 + add $tctr0,$ctr,#1 + add $tctr1,$ctr,#2 + aese $dat1,$rndping + aesmc $dat1,$dat1 + add $tctr2,$ctr,#3 + add $tctr3,$ctr,#4 + aese $dat2,$rndping + aesmc $dat2,$dat2 + add $tctr4,$ctr,#5 + add $tctr5,$ctr,#6 + rev $tctr0,$tctr0 + aese $dat3,$rndping + aesmc $dat3,$dat3 + add $tctr6,$ctr,#7 + add $tctr7,$ctr,#8 + rev $tctr1,$tctr1 + rev $tctr2,$tctr2 + aese $dat4,$rndping + aesmc $dat4,$dat4 + add $tctr8,$ctr,#9 + add $tctr9,$ctr,#10 + rev $tctr3,$tctr3 + rev $tctr4,$tctr4 + aese $dat5,$rndping + aesmc $dat5,$dat5 + add $tctr10,$ctr,#11 + add $tctr11,$ctr,#12 + rev $tctr5,$tctr5 + rev $tctr6,$tctr6 + aese 
$dat6,$rndping + aesmc $dat6,$dat6 + rev $tctr7,$tctr7 + rev $tctr8,$tctr8 + aese $dat7,$rndping + aesmc $dat7,$dat7 + rev $tctr9,$tctr9 + rev $tctr10,$tctr10 + aese $dat8,$rndping + aesmc $dat8,$dat8 + rev $tctr11,$tctr11 + aese $dat9,$rndping + aesmc $dat9,$dat9 + aese $dat10,$rndping + aesmc $dat10,$dat10 + aese $dat11,$rndping + aesmc $dat11,$dat11 + vld1.32 {$rndping},[$key_],#16 + + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + aese $dat2,$rndpang + aesmc $dat2,$dat2 + aese $dat3,$rndpang + aesmc $dat3,$dat3 + vld1.8 {$in0,$in1,$in2,$in3},[$inp],#64 + aese $dat4,$rndpang + aesmc $dat4,$dat4 + aese $dat5,$rndpang + aesmc $dat5,$dat5 + aese $dat6,$rndpang + aesmc $dat6,$dat6 + aese $dat7,$rndpang + aesmc $dat7,$dat7 + vld1.8 {$in4,$in5,$in6,$in7},[$inp],#64 + aese $dat8,$rndpang + aesmc $dat8,$dat8 + aese $dat9,$rndpang + aesmc $dat9,$dat9 + aese $dat10,$rndpang + aesmc $dat10,$dat10 + aese $dat11,$rndpang + aesmc $dat11,$dat11 + vld1.8 {$in8,$in9,$in10,$in11},[$inp],#64 + vld1.32 {$rndpang},[$key_],#16 + + mov $key_, $key + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + aese $dat2,$rndping + aesmc $dat2,$dat2 + aese $dat3,$rndping + aesmc $dat3,$dat3 + aese $dat4,$rndping + aesmc $dat4,$dat4 + aese $dat5,$rndping + aesmc $dat5,$dat5 + aese $dat6,$rndping + aesmc $dat6,$dat6 + aese $dat7,$rndping + aesmc $dat7,$dat7 + aese $dat8,$rndping + aesmc $dat8,$dat8 + aese $dat9,$rndping + aesmc $dat9,$dat9 + aese $dat10,$rndping + aesmc $dat10,$dat10 + aese $dat11,$rndping + aesmc $dat11,$dat11 + vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0] + + aese $dat0,$rndpang + eor3 $in0,$in0,$rndlast,$dat0 + vorr $dat0,$ivec,$ivec + aese $dat1,$rndpang + eor3 $in1,$in1,$rndlast,$dat1 + vorr $dat1,$ivec,$ivec + aese $dat2,$rndpang + eor3 $in2,$in2,$rndlast,$dat2 + vorr $dat2,$ivec,$ivec + aese $dat3,$rndpang + eor3 $in3,$in3,$rndlast,$dat3 + vorr $dat3,$ivec,$ivec + aese $dat4,$rndpang + eor3 $in4,$in4,$rndlast,$dat4 + vorr $dat4,$ivec,$ivec + aese $dat5,$rndpang + eor3 $in5,$in5,$rndlast,$dat5 + vorr $dat5,$ivec,$ivec + aese $dat6,$rndpang + eor3 $in6,$in6,$rndlast,$dat6 + vorr $dat6,$ivec,$ivec + aese $dat7,$rndpang + eor3 $in7,$in7,$rndlast,$dat7 + vorr $dat7,$ivec,$ivec + aese $dat8,$rndpang + eor3 $in8,$in8,$rndlast,$dat8 + vorr $dat8,$ivec,$ivec + aese $dat9,$rndpang + eor3 $in9,$in9,$rndlast,$dat9 + vorr $dat9,$ivec,$ivec + aese $dat10,$rndpang + eor3 $in10,$in10,$rndlast,$dat10 + vorr $dat10,$ivec,$ivec + aese $dat11,$rndpang + eor3 $in11,$in11,$rndlast,$dat11 + vorr $dat11,$ivec,$ivec + vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1] + + vmov.32 ${dat0}[3],$tctr0 + vmov.32 ${dat1}[3],$tctr1 + vmov.32 ${dat2}[3],$tctr2 + vmov.32 ${dat3}[3],$tctr3 + vst1.8 {$in0,$in1,$in2,$in3},[$out],#64 + vmov.32 ${dat4}[3],$tctr4 + vmov.32 ${dat5}[3],$tctr5 + vmov.32 ${dat6}[3],$tctr6 + vmov.32 ${dat7}[3],$tctr7 + vst1.8 {$in4,$in5,$in6,$in7},[$out],#64 + vmov.32 ${dat8}[3],$tctr8 + vmov.32 ${dat9}[3],$tctr9 + vmov.32 ${dat10}[3],$tctr10 + vmov.32 ${dat11}[3],$tctr11 + vst1.8 {$in8,$in9,$in10,$in11},[$out],#64 + + mov $cnt,$rounds + + add $ctr,$ctr,#12 + subs $len,$len,#12 + b.hs .Loop12x_ctr32_unroll + + // pop regs from stack when 12 data chunks are interleaved + ldp $dat10d,$dat11d,[sp],#32 + ldp $dat8d,$dat9d,[sp],#32 + ldp x23,x24,[sp],#16 + ldp x21,x22,[sp],#16 + ldp x19,x20,[sp],#16 + + add $len,$len,#12 + cbz $len,.Lctr32_done_unroll + sub $ctr,$ctr,#12 + + cmp $len,#2 + b.ls .Lctr32_tail_unroll + + cmp $len,#6 
+ sub $len,$len,#3 // bias + add $ctr,$ctr,#3 + b.lo .Loop3x_ctr32_unroll + + sub $len,$len,#3 + add $ctr,$ctr,#3 + b.lo .Loop6x_ctr32_unroll + +.align 4 +.Loop6x_ctr32_unroll: + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + aese $dat2,$rndping + aesmc $dat2,$dat2 + aese $dat3,$rndping + aesmc $dat3,$dat3 + aese $dat4,$rndping + aesmc $dat4,$dat4 + aese $dat5,$rndping + aesmc $dat5,$dat5 + vld1.32 {$rndping},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + aese $dat2,$rndpang + aesmc $dat2,$dat2 + aese $dat3,$rndpang + aesmc $dat3,$dat3 + aese $dat4,$rndpang + aesmc $dat4,$dat4 + aese $dat5,$rndpang + aesmc $dat5,$dat5 + vld1.32 {$rndpang},[$key_],#16 + b.gt .Loop6x_ctr32_unroll + + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + aese $dat2,$rndping + aesmc $dat2,$dat2 + aese $dat3,$rndping + aesmc $dat3,$dat3 + aese $dat4,$rndping + aesmc $dat4,$dat4 + aese $dat5,$rndping + aesmc $dat5,$dat5 + vld1.32 {$rndping},[$key_],#16 + + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + aese $dat2,$rndpang + aesmc $dat2,$dat2 + aese $dat3,$rndpang + aesmc $dat3,$dat3 + aese $dat4,$rndpang + aesmc $dat4,$dat4 + aese $dat5,$rndpang + aesmc $dat5,$dat5 + vld1.32 {$rndpang},[$key_],#16 + + aese $dat0,$rndping + aesmc $dat0,$dat0 + add $tctr0,$ctr,#1 + add $tctr1,$ctr,#2 + aese $dat1,$rndping + aesmc $dat1,$dat1 + add $tctr2,$ctr,#3 + add $tctr3,$ctr,#4 + aese $dat2,$rndping + aesmc $dat2,$dat2 + add $tctr4,$ctr,#5 + add $tctr5,$ctr,#6 + rev $tctr0,$tctr0 + aese $dat3,$rndping + aesmc $dat3,$dat3 + rev $tctr1,$tctr1 + rev $tctr2,$tctr2 + aese $dat4,$rndping + aesmc $dat4,$dat4 + rev $tctr3,$tctr3 + rev $tctr4,$tctr4 + aese $dat5,$rndping + aesmc $dat5,$dat5 + rev $tctr5,$tctr5 + vld1.32 {$rndping},[$key_],#16 + + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + vld1.8 {$in0,$in1,$in2,$in3},[$inp],#64 + aese $dat2,$rndpang + aesmc $dat2,$dat2 + aese $dat3,$rndpang + aesmc $dat3,$dat3 + vld1.8 {$in4,$in5},[$inp],#32 + aese $dat4,$rndpang + aesmc $dat4,$dat4 + aese $dat5,$rndpang + aesmc $dat5,$dat5 + vld1.32 {$rndpang},[$key_],#16 + + mov $key_, $key + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + aese $dat2,$rndping + aesmc $dat2,$dat2 + aese $dat3,$rndping + aesmc $dat3,$dat3 + aese $dat4,$rndping + aesmc $dat4,$dat4 + aese $dat5,$rndping + aesmc $dat5,$dat5 + vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0] + + aese $dat0,$rndpang + eor3 $in0,$in0,$rndlast,$dat0 + aese $dat1,$rndpang + eor3 $in1,$in1,$rndlast,$dat1 + aese $dat2,$rndpang + eor3 $in2,$in2,$rndlast,$dat2 + aese $dat3,$rndpang + eor3 $in3,$in3,$rndlast,$dat3 + aese $dat4,$rndpang + eor3 $in4,$in4,$rndlast,$dat4 + aese $dat5,$rndpang + eor3 $in5,$in5,$rndlast,$dat5 + vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1] + + vorr $dat0,$ivec,$ivec + vorr $dat1,$ivec,$ivec + vorr $dat2,$ivec,$ivec + vorr $dat3,$ivec,$ivec + vorr $dat4,$ivec,$ivec + vorr $dat5,$ivec,$ivec + + vmov.32 ${dat0}[3],$tctr0 + vmov.32 ${dat1}[3],$tctr1 + vst1.8 {$in0,$in1,$in2,$in3},[$out],#64 + vmov.32 ${dat2}[3],$tctr2 + vmov.32 ${dat3}[3],$tctr3 + vst1.8 {$in4,$in5},[$out],#32 + vmov.32 ${dat4}[3],$tctr4 + vmov.32 ${dat5}[3],$tctr5 + + cbz $len,.Lctr32_done_unroll + mov $cnt,$rounds + + cmp $len,#2 + b.ls .Lctr32_tail_unroll + + sub $len,$len,#3 // bias + add $ctr,$ctr,#3 + b .Loop3x_ctr32_unroll + +.align 4 
+.Loop3x_ctr32_unroll: + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + aese $dat2,$rndping + aesmc $dat2,$dat2 + vld1.32 {$rndping},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + aese $dat2,$rndpang + aesmc $dat2,$dat2 + vld1.32 {$rndpang},[$key_],#16 + b.gt .Loop3x_ctr32_unroll + + aese $dat0,$rndping + aesmc $tmp0,$dat0 + aese $dat1,$rndping + aesmc $tmp1,$dat1 + vld1.8 {$in0,$in1,$in2},[$inp],#48 + vorr $dat0,$ivec,$ivec + aese $dat2,$rndping + aesmc $dat2,$dat2 + vld1.32 {$rndping},[$key_],#16 + vorr $dat1,$ivec,$ivec + aese $tmp0,$rndpang + aesmc $tmp0,$tmp0 + aese $tmp1,$rndpang + aesmc $tmp1,$tmp1 + aese $dat2,$rndpang + aesmc $tmp2,$dat2 + vld1.32 {$rndpang},[$key_],#16 + vorr $dat2,$ivec,$ivec + add $tctr0,$ctr,#1 + aese $tmp0,$rndping + aesmc $tmp0,$tmp0 + aese $tmp1,$rndping + aesmc $tmp1,$tmp1 + add $tctr1,$ctr,#2 + aese $tmp2,$rndping + aesmc $tmp2,$tmp2 + vld1.32 {$rndping},[$key_],#16 + add $ctr,$ctr,#3 + aese $tmp0,$rndpang + aesmc $tmp0,$tmp0 + aese $tmp1,$rndpang + aesmc $tmp1,$tmp1 + + rev $tctr0,$tctr0 + aese $tmp2,$rndpang + aesmc $tmp2,$tmp2 + vld1.32 {$rndpang},[$key_],#16 + vmov.32 ${dat0}[3], $tctr0 + mov $key_,$key + rev $tctr1,$tctr1 + aese $tmp0,$rndping + aesmc $tmp0,$tmp0 + + aese $tmp1,$rndping + aesmc $tmp1,$tmp1 + vmov.32 ${dat1}[3], $tctr1 + rev $tctr2,$ctr + aese $tmp2,$rndping + aesmc $tmp2,$tmp2 + vmov.32 ${dat2}[3], $tctr2 + + aese $tmp0,$rndpang + aese $tmp1,$rndpang + aese $tmp2,$rndpang + + eor3 $in0,$in0,$rndlast,$tmp0 + vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0] + eor3 $in1,$in1,$rndlast,$tmp1 + mov $cnt,$rounds + eor3 $in2,$in2,$rndlast,$tmp2 + vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1] + vst1.8 {$in0,$in1,$in2},[$out],#48 + + cbz $len,.Lctr32_done_unroll + +.Lctr32_tail_unroll: + cmp $len,#1 + b.eq .Lctr32_tail_1_unroll + +.Lctr32_tail_2_unroll: + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + vld1.32 {$rndping},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + vld1.32 {$rndpang},[$key_],#16 + b.gt .Lctr32_tail_2_unroll + + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + vld1.32 {$rndping},[$key_],#16 + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + vld1.32 {$rndpang},[$key_],#16 + vld1.8 {$in0,$in1},[$inp],#32 + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + vld1.32 {$rndping},[$key_],#16 + aese $dat0,$rndpang + aesmc $dat0,$dat0 + aese $dat1,$rndpang + aesmc $dat1,$dat1 + vld1.32 {$rndpang},[$key_],#16 + aese $dat0,$rndping + aesmc $dat0,$dat0 + aese $dat1,$rndping + aesmc $dat1,$dat1 + aese $dat0,$rndpang + aese $dat1,$rndpang + + eor3 $in0,$in0,$rndlast,$dat0 + eor3 $in1,$in1,$rndlast,$dat1 + vst1.8 {$in0,$in1},[$out],#32 + b .Lctr32_done_unroll + +.Lctr32_tail_1_unroll: + aese $dat0,$rndping + aesmc $dat0,$dat0 + vld1.32 {$rndping},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,$rndpang + aesmc $dat0,$dat0 + vld1.32 {$rndpang},[$key_],#16 + b.gt .Lctr32_tail_1_unroll + + aese $dat0,$rndping + aesmc $dat0,$dat0 + vld1.32 {$rndping},[$key_],#16 + aese $dat0,$rndpang + aesmc $dat0,$dat0 + vld1.32 {$rndpang},[$key_],#16 + vld1.8 {$in0},[$inp] + aese $dat0,$rndping + aesmc $dat0,$dat0 + vld1.32 {$rndping},[$key_],#16 + aese $dat0,$rndpang + aesmc $dat0,$dat0 + vld1.32 {$rndpang},[$key_],#16 + aese 
$dat0,$rndping + aesmc $dat0,$dat0 + aese $dat0,$rndpang + + eor3 $in0,$in0,$rndlast,$dat0 + vst1.8 {$in0},[$out],#16 + +.Lctr32_done_unroll: + ldr x29,[sp],#16 + ret +.size ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,.-${prefix}_ctr32_encrypt_blocks_unroll12_eor3 +___ +}}} + {{{ my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my ($rounds,$cnt,$key_)=("w5","w6","x7"); @@ -3640,7 +4381,8 @@ ___ if ($flavour =~ /64/) { ######## 64-bit code my %opcode = ( "aesd" => 0x4e285800, "aese" => 0x4e284800, - "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); + "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800, + "eor3" => 0xce000000, ); local *unaes = sub { my ($mnemonic,$arg)=@_; @@ -3651,10 +4393,21 @@ if ($flavour =~ /64/) { ######## 64-bit code $mnemonic,$arg; }; + sub unsha3 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/ + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10), + $mnemonic,$arg; + } + foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; - s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/\bq([0-9]+)\b/"v".($1<8?$1:($1<24?$1+8:$1-16)).".16b"/geo; # old->new registers + s/\bq_([0-9]+)\b/"q".$1/geo; # old->new registers s/@\s/\/\//o; # old->new style commentary #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or @@ -3667,6 +4420,7 @@ if ($flavour =~ /64/) { ######## 64-bit code s/vshr/ushr/o or s/^(\s+)v/$1/o or # strip off v prefix s/\bbx\s+lr\b/ret/o; + s/\b(eor3)\s+(v.*)/unsha3($1,$2)/ge; # fix up remaining legacy suffixes s/\.[ui]?8//o; diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index 83acbe0126..b76981f48b 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -88,6 +88,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; # define ARMV8_SVE (1<<13) # define ARMV8_SVE2 (1<<14) # define ARMV8_HAVE_SHA3_AND_WORTH_USING (1<<15) +# define ARMV8_UNROLL12_EOR3 (1<<16) /* * MIDR_EL1 system register diff --git a/crypto/armcap.c b/crypto/armcap.c index b13da3be81..adb8b6a188 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -421,6 +421,10 @@ void OPENSSL_cpuid_setup(void) MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V2)) && (OPENSSL_armcap_P & ARMV8_SHA3)) OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3; + if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) || + MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V2)) && + (OPENSSL_armcap_P & ARMV8_SHA3)) + OPENSSL_armcap_P |= ARMV8_UNROLL12_EOR3; if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) || MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) || MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_PRO) || diff --git a/include/crypto/aes_platform.h b/include/crypto/aes_platform.h index eb280e754a..8a9e7a0535 100644 --- a/include/crypto/aes_platform.h +++ b/include/crypto/aes_platform.h @@ -107,17 +107,21 @@ void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); # define HWAES_cbc_encrypt aes_v8_cbc_encrypt # define HWAES_ecb_encrypt aes_v8_ecb_encrypt # if __ARM_MAX_ARCH__>=8 && (defined(__aarch64__) || defined(_M_ARM64)) +# define ARMv8_HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) # define HWAES_xts_encrypt aes_v8_xts_encrypt # define HWAES_xts_decrypt aes_v8_xts_decrypt # endif # define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks +# define HWAES_ctr32_encrypt_blocks_unroll12_eor3 
aes_v8_ctr32_encrypt_blocks_unroll12_eor3 # define AES_PMULL_CAPABLE ((OPENSSL_armcap_P & ARMV8_PMULL) && (OPENSSL_armcap_P & ARMV8_AES)) +# define AES_UNROLL12_EOR3_CAPABLE (OPENSSL_armcap_P & ARMV8_UNROLL12_EOR3) # define AES_GCM_ENC_BYTES 512 # define AES_GCM_DEC_BYTES 512 # if __ARM_MAX_ARCH__>=8 && (defined(__aarch64__) || defined(_M_ARM64)) # define AES_gcm_encrypt armv8_aes_gcm_encrypt # define AES_gcm_decrypt armv8_aes_gcm_decrypt -# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_v8_ctr32_encrypt_blocks && \ +# define AES_GCM_ASM(gctx) (((gctx)->ctr==aes_v8_ctr32_encrypt_blocks_unroll12_eor3 || \ + (gctx)->ctr==aes_v8_ctr32_encrypt_blocks) && \ (gctx)->gcm.funcs.ghash==gcm_ghash_v8) /* The [unroll8_eor3_]aes_gcm_(enc|dec)_(128|192|256)_kernel() functions * take input length in BITS and return number of BYTES processed */ @@ -545,6 +549,11 @@ void HWAES_ecb_encrypt(const unsigned char *in, unsigned char *out, void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, size_t len, const void *key, const unsigned char ivec[16]); +# if defined(AES_UNROLL12_EOR3_CAPABLE) +void HWAES_ctr32_encrypt_blocks_unroll12_eor3(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); +# endif void HWAES_xts_encrypt(const unsigned char *inp, unsigned char *out, size_t len, const AES_KEY *key1, const AES_KEY *key2, const unsigned char iv[16]); diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc b/providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc index bdcf670716..cc24071500 100644 --- a/providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc +++ b/providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc @@ -1,5 +1,5 @@ /* - * Copyright 2019-2022 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -86,8 +86,13 @@ static int armv8_aes_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key, PROV_AES_GCM_CTX *actx = (PROV_AES_GCM_CTX *)ctx; AES_KEY *ks = &actx->ks.ks; - GCM_HW_SET_KEY_CTR_FN(ks, aes_v8_set_encrypt_key, aes_v8_encrypt, - aes_v8_ctr32_encrypt_blocks); + if (AES_UNROLL12_EOR3_CAPABLE) { + GCM_HW_SET_KEY_CTR_FN(ks, aes_v8_set_encrypt_key, aes_v8_encrypt, + aes_v8_ctr32_encrypt_blocks_unroll12_eor3); + } else { + GCM_HW_SET_KEY_CTR_FN(ks, aes_v8_set_encrypt_key, aes_v8_encrypt, + aes_v8_ctr32_encrypt_blocks); + } return 1; } diff --git a/providers/implementations/ciphers/cipher_aes_hw.c b/providers/implementations/ciphers/cipher_aes_hw.c index 0a1243a5fc..e8550a9644 100644 --- a/providers/implementations/ciphers/cipher_aes_hw.c +++ b/providers/implementations/ciphers/cipher_aes_hw.c @@ -146,6 +146,8 @@ const PROV_CIPHER_HW *ossl_prov_cipher_hw_aes_##mode(size_t keybits) \ # include "cipher_aes_hw_rv64i.inc" #elif defined(__riscv) && __riscv_xlen == 32 # include "cipher_aes_hw_rv32i.inc" +#elif defined (ARMv8_HWAES_CAPABLE) +# include "cipher_aes_hw_armv8.inc" #else /* The generic case */ # define PROV_CIPHER_HW_declare(mode) diff --git a/providers/implementations/ciphers/cipher_aes_hw_armv8.inc b/providers/implementations/ciphers/cipher_aes_hw_armv8.inc new file mode 100644 index 0000000000..3f73c79290 --- /dev/null +++ b/providers/implementations/ciphers/cipher_aes_hw_armv8.inc @@ -0,0 +1,34 @@ +/* + * Copyright 2023 The OpenSSL Project Authors. 
All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* + * Crypto extension support for AES modes ecb, cbc, ofb, cfb, ctr. + * This file is included by cipher_aes_hw.c + */ + +static int cipher_hw_aes_arm_initkey(PROV_CIPHER_CTX *dat, + const unsigned char *key, + size_t keylen) +{ + int ret = cipher_hw_aes_initkey(dat, key, keylen); + if (AES_UNROLL12_EOR3_CAPABLE && dat->mode == EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks_unroll12_eor3; + + return ret; +} + +#define PROV_CIPHER_HW_declare(mode) \ +static const PROV_CIPHER_HW aes_arm_##mode = { \ + cipher_hw_aes_arm_initkey, \ + ossl_cipher_hw_generic_##mode, \ + cipher_hw_aes_copyctx \ +}; +#define PROV_CIPHER_HW_select(mode) \ +if (ARMv8_HWAES_CAPABLE) \ + return &aes_arm_##mode; |