aboutsummaryrefslogtreecommitdiffstats
path: root/crypto/bn/bn_gf2m.c
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2011-05-04 15:22:53 +0000
committerAndy Polyakov <appro@openssl.org>2011-05-04 15:22:53 +0000
commit034688ec4d0e3d350dc0ee9602552f92e8889fc0 (patch)
tree1faf04231d3d06f0fe70daed229af3105aa40498 /crypto/bn/bn_gf2m.c
parentd16765919d78a576425272adc0c3fb0b493b1198 (diff)
downloadopenssl-034688ec4d0e3d350dc0ee9602552f92e8889fc0.tar.gz
bn_gf2m.c: optimized BN_GF2m_mod_inv delivers sometimes 2x of ECDSA sign.
Exact improvement coefficients vary from one benchmark and platform to another, e.g. it performs 70%-33% better on ARM, hereafter less for longer keys, and 100%-90% better on x86_64.
Diffstat (limited to 'crypto/bn/bn_gf2m.c')
-rw-r--r--crypto/bn/bn_gf2m.c96
1 files changed, 80 insertions, 16 deletions
diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c
index e170fffc6c..6caf2888a6 100644
--- a/crypto/bn/bn_gf2m.c
+++ b/crypto/bn/bn_gf2m.c
@@ -364,21 +364,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
{
int ret = 0;
- const int max = BN_num_bits(p) + 1;
- int *arr=NULL;
+ int arr[6];
bn_check_top(a);
bn_check_top(p);
- if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
- ret = BN_GF2m_poly2arr(p, arr, max);
- if (!ret || ret > max)
+ ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
+ if (!ret || ret > sizeof(arr)/sizeof(arr[0]))
{
BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
- goto err;
+ return 0;
}
ret = BN_GF2m_mod_arr(r, a, arr);
bn_check_top(r);
-err:
- if (arr) OPENSSL_free(arr);
return ret;
}
@@ -533,18 +529,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
BN_CTX_start(ctx);
- b = BN_CTX_get(ctx);
- c = BN_CTX_get(ctx);
- u = BN_CTX_get(ctx);
- v = BN_CTX_get(ctx);
- if (v == NULL) goto err;
+ if ((b = BN_CTX_get(ctx))==NULL) goto err;
+ if ((c = BN_CTX_get(ctx))==NULL) goto err;
+ if ((u = BN_CTX_get(ctx))==NULL) goto err;
+ if ((v = BN_CTX_get(ctx))==NULL) goto err;
- if (!BN_one(b)) goto err;
if (!BN_GF2m_mod(u, a, p)) goto err;
- if (!BN_copy(v, p)) goto err;
-
if (BN_is_zero(u)) goto err;
+ if (!BN_copy(v, p)) goto err;
+#if 0
+ if (!BN_one(b)) goto err;
+
while (1)
{
while (!BN_is_odd(u))
@@ -568,7 +564,75 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
if (!BN_GF2m_add(u, u, v)) goto err;
if (!BN_GF2m_add(b, b, c)) goto err;
}
+#else
+ {
+ int i, ubits = BN_num_bits(u),
+ vbits = BN_num_bits(v), /* v is copy of p */
+ top = p->top;
+ BN_ULONG *udp,*bdp,*vdp,*cdp;
+
+ bn_wexpand(u,top); udp = u->d;
+ for (i=u->top;i<top;i++) udp[i] = 0;
+ u->top = top;
+ bn_wexpand(b,top); bdp = b->d;
+ bdp[0] = 1;
+ for (i=1;i<top;i++) bdp[i] = 0;
+ b->top = top;
+ bn_wexpand(c,top); cdp = c->d;
+ for (i=0;i<top;i++) cdp[i] = 0;
+ c->top = top;
+ vdp = v->d; /* It pays off to "cache" *->d pointers, because
+ * it allows optimizer to be more aggressive.
+ * But we don't have to "cache" p->d, because *p
+ * is declared 'const'... */
+ while (1)
+ {
+ while (ubits && !(udp[0]&1))
+ {
+ BN_ULONG u0,u1,b0,b1,mask;
+
+ u0 = udp[0];
+ b0 = bdp[0];
+ mask = (BN_ULONG)0-(b0&1);
+ b0 ^= p->d[0]&mask;
+ for (i=0;i<top-1;i++)
+ {
+ u1 = udp[i+1];
+ udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
+ u0 = u1;
+ b1 = bdp[i+1]^(p->d[i+1]&mask);
+ bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
+ b0 = b1;
+ }
+ udp[i] = u0>>1;
+ bdp[i] = b0>>1;
+ ubits--;
+ }
+ if (ubits<=BN_BITS2 && udp[0]==1) break;
+
+ if (ubits<vbits)
+ {
+ i = ubits; ubits = vbits; vbits = i;
+ tmp = u; u = v; v = tmp;
+ tmp = b; b = c; c = tmp;
+ udp = vdp; vdp = v->d;
+ bdp = cdp; cdp = c->d;
+ }
+ for(i=0;i<top;i++)
+ {
+ udp[i] ^= vdp[i];
+ bdp[i] ^= cdp[i];
+ }
+ if (ubits==vbits)
+ {
+ bn_fix_top(u);
+ ubits = BN_num_bits(u);
+ }
+ }
+ bn_fix_top(b);
+ }
+#endif
if (!BN_copy(r, b)) goto err;
bn_check_top(r);