diff options
author | Andy Polyakov <appro@openssl.org> | 1999-10-07 12:03:59 +0000 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 1999-10-07 12:03:59 +0000 |
commit | 2dae04d03880cc88a48700e423ee94f9b94242bd (patch) | |
tree | 13ead4e5a13244bfc7b2c6a0fcb6cc3fe8436d2f /crypto/rc4/rc4_enc.c | |
parent | def38e38ec4fa5795414ee6536f3806f7fe7db13 (diff) | |
download | openssl-2dae04d03880cc88a48700e423ee94f9b94242bd.tar.gz |
RC4 tune-up featuring 30-40% performance improvement on most RISC
platforms. See crypto/rc4/rc4_enc.c for further details.
Diffstat (limited to 'crypto/rc4/rc4_enc.c')
-rw-r--r-- | crypto/rc4/rc4_enc.c | 216 |
1 files changed, 151 insertions, 65 deletions
diff --git a/crypto/rc4/rc4_enc.c b/crypto/rc4/rc4_enc.c index 93a75cd8f9..35dbc7c47e 100644 --- a/crypto/rc4/rc4_enc.c +++ b/crypto/rc4/rc4_enc.c @@ -78,7 +78,7 @@ void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, y=key->y; d=key->data; -#if defined(RC4_CHUNK) && (defined(L_ENDIAN) || defined(B_ENDIAN)) +#if defined(RC4_CHUNK) /* * The original reason for implementing this(*) was the fact that * pre-21164a Alpha CPUs don't have byte load/store instructions @@ -87,21 +87,30 @@ void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, * at natural word size made it possible to reduce amount of * instructions as well as to perform early read-ahead without * suffering from RAW (read-after-write) hazard. This resulted - * in >40%(**) performance improvement (on 21064 box with gcc). + * in ~40%(**) performance improvement on 21064 box with gcc. * But it's not only Alpha users who win here:-) Thanks to the * early-n-wide read-ahead this implementation also exhibits - * >40% speed-up on SPARC and almost 20% on MIPS. + * >40% speed-up on SPARC and 20-30% on 64-bit MIPS (depending + * on sizeof(RC4_INT)). * * (*) "this" means code which recognizes the case when input * and output pointers appear to be aligned at natural CPU - * word boundary. + * word boundary * (**) i.e. according to 'apps/openssl speed rc4' benchmark, - * crypto/rc4/rc4speed.c exhibits almost 70% speed-up. + * crypto/rc4/rc4speed.c exhibits almost 70% speed-up... + * + * Caveats. + * + * - RC4_CHUNK="unsigned long long" should be a #1 choice for + * UltraSPARC. Unfortunately gcc generates very slow code + * (2.5-3 times slower than one generated by Sun's WorkShop + * C) and therefore gcc (at least 2.95 and earlier) should + * always be told that RC4_CHUNK="unsigned long". 
* * <appro@fy.chalmers.se> */ -#define RC4_STEP ( \ +# define RC4_STEP ( \ x=(x+1) &0xff, \ tx=d[x], \ y=(tx+y)&0xff, \ @@ -111,70 +120,148 @@ void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, (RC4_CHUNK)d[(tx+ty)&0xff]\ ) -#if defined(L_ENDIAN) -# define SHFT(c) ((c)*8) -# define MASK(i) (((RC4_CHUNK)-1)>>((sizeof(RC4_CHUNK)-(i))<<3)) -# define SHINC 8 -#elif defined(B_ENDIAN) -# define SHFT(c) ((sizeof(RC4_CHUNK)-(c)-1)*8) -# define MASK(i) (((RC4_CHUNK)-1)<<((sizeof(RC4_CHUNK)-(i))<<3)) -# define SHINC -8 -#else -# error "L_ENDIAN or B_ENDIAN *must* be defined!" -#endif - if ( ( ((unsigned long)indata & (sizeof(RC4_CHUNK)-1)) | - ((unsigned long)outdata & (sizeof(RC4_CHUNK)-1)) ) == 0 - ) { - RC4_CHUNK ichunk,cipher; + ((unsigned long)outdata & (sizeof(RC4_CHUNK)-1)) ) == 0 ) + { + RC4_CHUNK ichunk,otp; + const union { long one; char little; } is_endian = {1}; - for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK)) { - ichunk = *(RC4_CHUNK *)indata; - cipher = RC4_STEP<<SHFT(0); - cipher |= RC4_STEP<<SHFT(1); - cipher |= RC4_STEP<<SHFT(2); - cipher |= RC4_STEP<<SHFT(3); -#ifdef RC4_CHUNK_IS_64_BIT - cipher |= RC4_STEP<<SHFT(4); - cipher |= RC4_STEP<<SHFT(5); - cipher |= RC4_STEP<<SHFT(6); - cipher |= RC4_STEP<<SHFT(7); -#endif - *(RC4_CHUNK *)outdata = cipher^ichunk; - indata += sizeof(RC4_CHUNK); - outdata += sizeof(RC4_CHUNK); - } - if (len) { - RC4_CHUNK mask,ochunk; + /* + * I reckon we can afford to implement both endian + * cases and to decide which way to take at run-time + * because the machine code appears to be very compact + * and redundant 1-2KB is perfectly tolerable (i.e. + * in case the compiler fails to eliminate it:-). By + * suggestion from Terrel Larson <terr@terralogic.net> + * who also stands for the is_endian union:-) + * + * Special notes. 
+ * + * - is_endian is declared automatic as doing otherwise + * (declaring static) prevents gcc from eliminating + * the redundant code; + * - compilers (those I've tried) don't seem to have + * problems eliminating either the operators guarded + * by "if (sizeof(RC4_CHUNK)==8)" or the condition + * expressions themselves so I've got 'em to replace + * corresponding #ifdefs from the previous version; + * - I chose to let the redundant switch cases when + * sizeof(RC4_CHUNK)!=8 be (were also #ifdefed + * before); + * - in case you wonder "&(sizeof(RC4_CHUNK)*8-1)" in + * [LB]ESHFT guards against "shift is out of range" + * warnings when sizeof(RC4_CHUNK)!=8 + * + * <appro@fy.chalmers.se> + */ + if (!is_endian.little) + { /* BIG-ENDIAN CASE */ +# define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1)) + for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK)) + { + ichunk = *(RC4_CHUNK *)indata; + otp = RC4_STEP<<BESHFT(0); + otp |= RC4_STEP<<BESHFT(1); + otp |= RC4_STEP<<BESHFT(2); + otp |= RC4_STEP<<BESHFT(3); + if (sizeof(RC4_CHUNK)==8) + { + otp |= RC4_STEP<<BESHFT(4); + otp |= RC4_STEP<<BESHFT(5); + otp |= RC4_STEP<<BESHFT(6); + otp |= RC4_STEP<<BESHFT(7); + } + *(RC4_CHUNK *)outdata = otp^ichunk; + indata += sizeof(RC4_CHUNK); + outdata += sizeof(RC4_CHUNK); + } + if (len) + { + RC4_CHUNK mask=(RC4_CHUNK)-1, ochunk; - ichunk = *(RC4_CHUNK *)indata; - ochunk = *(RC4_CHUNK *)outdata; - cipher = 0; - i = SHFT(0); - mask = MASK(len); - switch (len) { -#ifdef RC4_CHUNK_IS_64_BIT - case 7: cipher = RC4_STEP<<SHFT(0), i+=SHINC; - case 6: cipher |= RC4_STEP<<i, i+=SHINC; - case 5: cipher |= RC4_STEP<<i, i+=SHINC; - case 4: cipher |= RC4_STEP<<i, i+=SHINC; - case 3: cipher |= RC4_STEP<<i, i+=SHINC; -#else - case 3: cipher = RC4_STEP<<SHFT(0), i+=SHINC; -#endif - case 2: cipher |= RC4_STEP<<i, i+=SHINC; - case 1: cipher |= RC4_STEP<<i, i+=SHINC; - case 0: ; /* it's never the case, but it - has to be here for ultrix? 
*/ + ichunk = *(RC4_CHUNK *)indata; + ochunk = *(RC4_CHUNK *)outdata; + otp = 0; + i = BESHFT(0); + mask <<= (sizeof(RC4_CHUNK)-len)<<3; + switch (len&(sizeof(RC4_CHUNK)-1)) + { + case 7: otp = RC4_STEP<<i, i-=8; + case 6: otp |= RC4_STEP<<i, i-=8; + case 5: otp |= RC4_STEP<<i, i-=8; + case 4: otp |= RC4_STEP<<i, i-=8; + case 3: otp |= RC4_STEP<<i, i-=8; + case 2: otp |= RC4_STEP<<i, i-=8; + case 1: otp |= RC4_STEP<<i, i-=8; + case 0: ; /* + * it's never the case, + * but it has to be here + * for ultrix? + */ + } + ochunk &= ~mask; + ochunk |= (otp^ichunk) & mask; + *(RC4_CHUNK *)outdata = ochunk; + } + key->x=x; + key->y=y; + return; + } + else + { /* LITTLE-ENDIAN CASE */ +# define LESHFT(c) (((c)*8)&(sizeof(RC4_CHUNK)*8-1)) + for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK)) + { + ichunk = *(RC4_CHUNK *)indata; + otp = RC4_STEP; + otp |= RC4_STEP<<8; + otp |= RC4_STEP<<16; + otp |= RC4_STEP<<24; + if (sizeof(RC4_CHUNK)==8) + { + otp |= RC4_STEP<<LESHFT(4); + otp |= RC4_STEP<<LESHFT(5); + otp |= RC4_STEP<<LESHFT(6); + otp |= RC4_STEP<<LESHFT(7); + } + *(RC4_CHUNK *)outdata = otp^ichunk; + indata += sizeof(RC4_CHUNK); + outdata += sizeof(RC4_CHUNK); + } + if (len) + { + RC4_CHUNK mask=(RC4_CHUNK)-1, ochunk; + + ichunk = *(RC4_CHUNK *)indata; + ochunk = *(RC4_CHUNK *)outdata; + otp = 0; + i = 0; + mask >>= (sizeof(RC4_CHUNK)-len)<<3; + switch (len&(sizeof(RC4_CHUNK)-1)) + { + case 7: otp = RC4_STEP, i+=8; + case 6: otp |= RC4_STEP<<i, i+=8; + case 5: otp |= RC4_STEP<<i, i+=8; + case 4: otp |= RC4_STEP<<i, i+=8; + case 3: otp |= RC4_STEP<<i, i+=8; + case 2: otp |= RC4_STEP<<i, i+=8; + case 1: otp |= RC4_STEP<<i, i+=8; + case 0: ; /* + * it's never the case, + * but it has to be here + * for ultrix? 
+ */ + } + ochunk &= ~mask; + ochunk |= (otp^ichunk) & mask; + *(RC4_CHUNK *)outdata = ochunk; + } + key->x=x; + key->y=y; + return; } - ochunk &= ~mask; - ochunk |= (cipher^ichunk) & mask; - *(RC4_CHUNK *)outdata = ochunk; } - } - else #endif - { #define LOOP(in,out) \ x=((x+1)&0xff); \ tx=d[x]; \ @@ -223,7 +310,6 @@ void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, RC4_LOOP(indata,outdata,6); if (--i == 0) break; } } - } key->x=x; key->y=y; } |