ssl/libcrypto/src/crypto/bn/bn_asm.c
changeset 0 e4d67989cc36
equal deleted inserted replaced
-1:000000000000 0:e4d67989cc36
       
     1 /* crypto/bn/bn_asm.c */
       
     2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
       
     3  * All rights reserved.
       
     4  *
       
     5  * This package is an SSL implementation written
       
     6  * by Eric Young (eay@cryptsoft.com).
       
     7  * The implementation was written so as to conform with Netscapes SSL.
       
     8  * 
       
     9  * This library is free for commercial and non-commercial use as long as
       
    10  * the following conditions are aheared to.  The following conditions
       
    11  * apply to all code found in this distribution, be it the RC4, RSA,
       
    12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
       
    13  * included with this distribution is covered by the same copyright terms
       
    14  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
       
    15  * 
       
    16  * Copyright remains Eric Young's, and as such any Copyright notices in
       
    17  * the code are not to be removed.
       
    18  * If this package is used in a product, Eric Young should be given attribution
       
    19  * as the author of the parts of the library used.
       
    20  * This can be in the form of a textual message at program startup or
       
    21  * in documentation (online or textual) provided with the package.
       
    22  * 
       
    23  * Redistribution and use in source and binary forms, with or without
       
    24  * modification, are permitted provided that the following conditions
       
    25  * are met:
       
    26  * 1. Redistributions of source code must retain the copyright
       
    27  *    notice, this list of conditions and the following disclaimer.
       
    28  * 2. Redistributions in binary form must reproduce the above copyright
       
    29  *    notice, this list of conditions and the following disclaimer in the
       
    30  *    documentation and/or other materials provided with the distribution.
       
    31  * 3. All advertising materials mentioning features or use of this software
       
    32  *    must display the following acknowledgement:
       
    33  *    "This product includes cryptographic software written by
       
    34  *     Eric Young (eay@cryptsoft.com)"
       
    35  *    The word 'cryptographic' can be left out if the rouines from the library
       
    36  *    being used are not cryptographic related :-).
       
    37  * 4. If you include any Windows specific code (or a derivative thereof) from 
       
    38  *    the apps directory (application code) you must include an acknowledgement:
       
    39  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
       
    40  * 
       
    41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
       
    42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
       
    45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       
    46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       
    47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       
    48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       
    49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       
    50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       
    51  * SUCH DAMAGE.
       
    52  * 
       
    53  * The licence and distribution terms for any publically available version or
       
    54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
       
    55  * copied and put under another distribution licence
       
    56  * [including the GNU Public Licence.]
       
    57  */
       
    58 
       
    59 #ifndef BN_DEBUG
       
    60 # undef NDEBUG /* avoid conflicting definitions */
       
    61 # define NDEBUG
       
    62 #endif
       
    63 
       
    64 #include <stdio.h>
       
    65 #include <assert.h>
       
    66 #include "cryptlib.h"
       
    67 #include "bn_lcl.h"
       
    68 
       
    69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
       
    70 
       
    71 EXPORT_C BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
       
    72 	{
       
    73 	BN_ULONG c1=0;
       
    74 
       
    75 	assert(num >= 0);
       
    76 	if (num <= 0) return(c1);
       
    77 
       
    78 	while (num&~3)
       
    79 		{
       
    80 		mul_add(rp[0],ap[0],w,c1);
       
    81 		mul_add(rp[1],ap[1],w,c1);
       
    82 		mul_add(rp[2],ap[2],w,c1);
       
    83 		mul_add(rp[3],ap[3],w,c1);
       
    84 		ap+=4; rp+=4; num-=4;
       
    85 		}
       
    86 	if (num)
       
    87 		{
       
    88 		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
       
    89 		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
       
    90 		mul_add(rp[2],ap[2],w,c1); return c1;
       
    91 		}
       
    92 	
       
    93 	return(c1);
       
    94 	} 
       
    95 
       
    96 EXPORT_C BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
       
    97 	{
       
    98 	BN_ULONG c1=0;
       
    99 
       
   100 	assert(num >= 0);
       
   101 	if (num <= 0) return(c1);
       
   102 
       
   103 	while (num&~3)
       
   104 		{
       
   105 		mul(rp[0],ap[0],w,c1);
       
   106 		mul(rp[1],ap[1],w,c1);
       
   107 		mul(rp[2],ap[2],w,c1);
       
   108 		mul(rp[3],ap[3],w,c1);
       
   109 		ap+=4; rp+=4; num-=4;
       
   110 		}
       
   111 	if (num)
       
   112 		{
       
   113 		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
       
   114 		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
       
   115 		mul(rp[2],ap[2],w,c1);
       
   116 		}
       
   117 	return(c1);
       
   118 	} 
       
   119 
       
   120 EXPORT_C void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
       
   121         {
       
   122 	assert(n >= 0);
       
   123 	if (n <= 0) return;
       
   124 	while (n&~3)
       
   125 		{
       
   126 		sqr(r[0],r[1],a[0]);
       
   127 		sqr(r[2],r[3],a[1]);
       
   128 		sqr(r[4],r[5],a[2]);
       
   129 		sqr(r[6],r[7],a[3]);
       
   130 		a+=4; r+=8; n-=4;
       
   131 		}
       
   132 	if (n)
       
   133 		{
       
   134 		sqr(r[0],r[1],a[0]); if (--n == 0) return;
       
   135 		sqr(r[2],r[3],a[1]); if (--n == 0) return;
       
   136 		sqr(r[4],r[5],a[2]);
       
   137 		}
       
   138 	}
       
   139 
       
   140 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
       
   141 
       
   142 EXPORT_C BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
       
   143 	{
       
   144 	BN_ULONG c=0;
       
   145 	BN_ULONG bl,bh;
       
   146 
       
   147 	assert(num >= 0);
       
   148 	if (num <= 0) return((BN_ULONG)0);
       
   149 
       
   150 	bl=LBITS(w);
       
   151 	bh=HBITS(w);
       
   152 
       
   153 	for (;;)
       
   154 		{
       
   155 		mul_add(rp[0],ap[0],bl,bh,c);
       
   156 		if (--num == 0) break;
       
   157 		mul_add(rp[1],ap[1],bl,bh,c);
       
   158 		if (--num == 0) break;
       
   159 		mul_add(rp[2],ap[2],bl,bh,c);
       
   160 		if (--num == 0) break;
       
   161 		mul_add(rp[3],ap[3],bl,bh,c);
       
   162 		if (--num == 0) break;
       
   163 		ap+=4;
       
   164 		rp+=4;
       
   165 		}
       
   166 	return(c);
       
   167 	} 
       
   168 
       
   169 EXPORT_C BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
       
   170 	{
       
   171 	BN_ULONG carry=0;
       
   172 	BN_ULONG bl,bh;
       
   173 
       
   174 	assert(num >= 0);
       
   175 	if (num <= 0) return((BN_ULONG)0);
       
   176 
       
   177 	bl=LBITS(w);
       
   178 	bh=HBITS(w);
       
   179 
       
   180 	for (;;)
       
   181 		{
       
   182 		mul(rp[0],ap[0],bl,bh,carry);
       
   183 		if (--num == 0) break;
       
   184 		mul(rp[1],ap[1],bl,bh,carry);
       
   185 		if (--num == 0) break;
       
   186 		mul(rp[2],ap[2],bl,bh,carry);
       
   187 		if (--num == 0) break;
       
   188 		mul(rp[3],ap[3],bl,bh,carry);
       
   189 		if (--num == 0) break;
       
   190 		ap+=4;
       
   191 		rp+=4;
       
   192 		}
       
   193 	return(carry);
       
   194 	} 
       
   195 
       
   196 EXPORT_C void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
       
   197         {
       
   198 	assert(n >= 0);
       
   199 	if (n <= 0) return;
       
   200 	for (;;)
       
   201 		{
       
   202 		sqr64(r[0],r[1],a[0]);
       
   203 		if (--n == 0) break;
       
   204 
       
   205 		sqr64(r[2],r[3],a[1]);
       
   206 		if (--n == 0) break;
       
   207 
       
   208 		sqr64(r[4],r[5],a[2]);
       
   209 		if (--n == 0) break;
       
   210 
       
   211 		sqr64(r[6],r[7],a[3]);
       
   212 		if (--n == 0) break;
       
   213 
       
   214 		a+=4;
       
   215 		r+=8;
       
   216 		}
       
   217 	}
       
   218 
       
   219 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
       
   220 
       
   221 #if defined(BN_LLONG) && defined(BN_DIV2W)
       
   222 
       
   223 EXPORT_C BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
       
   224 	{
       
   225 	return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d));
       
   226 	}
       
   227 
       
   228 #else
       
   229 
       
   230 /* Divide h,l by d and return the result. */
       
   231 /* I need to test this some more :-( */
       
   232 EXPORT_C BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
       
   233 	{
       
   234 	BN_ULONG dh,dl,q,ret=0,th,tl,t;
       
   235 	int i,count=2;
       
   236 
       
   237 	if (d == 0) return(BN_MASK2);
       
   238 
       
   239 	i=BN_num_bits_word(d);
       
   240 	assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));
       
   241 
       
   242 	i=BN_BITS2-i;
       
   243 	if (h >= d) h-=d;
       
   244 
       
   245 	if (i)
       
   246 		{
       
   247 		d<<=i;
       
   248 		h=(h<<i)|(l>>(BN_BITS2-i));
       
   249 		l<<=i;
       
   250 		}
       
   251 	dh=(d&BN_MASK2h)>>BN_BITS4;
       
   252 	dl=(d&BN_MASK2l);
       
   253 	for (;;)
       
   254 		{
       
   255 		if ((h>>BN_BITS4) == dh)
       
   256 			q=BN_MASK2l;
       
   257 		else
       
   258 			q=h/dh;
       
   259 
       
   260 		th=q*dh;
       
   261 		tl=dl*q;
       
   262 		for (;;)
       
   263 			{
       
   264 			t=h-th;
       
   265 			if ((t&BN_MASK2h) ||
       
   266 				((tl) <= (
       
   267 					(t<<BN_BITS4)|
       
   268 					((l&BN_MASK2h)>>BN_BITS4))))
       
   269 				break;
       
   270 			q--;
       
   271 			th-=dh;
       
   272 			tl-=dl;
       
   273 			}
       
   274 		t=(tl>>BN_BITS4);
       
   275 		tl=(tl<<BN_BITS4)&BN_MASK2h;
       
   276 		th+=t;
       
   277 
       
   278 		if (l < tl) th++;
       
   279 		l-=tl;
       
   280 		if (h < th)
       
   281 			{
       
   282 			h+=d;
       
   283 			q--;
       
   284 			}
       
   285 		h-=th;
       
   286 
       
   287 		if (--count == 0) break;
       
   288 
       
   289 		ret=q<<BN_BITS4;
       
   290 		h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
       
   291 		l=(l&BN_MASK2l)<<BN_BITS4;
       
   292 		}
       
   293 	ret|=q;
       
   294 	return(ret);
       
   295 	}
       
    296 #endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
       
   297 
       
   298 #ifdef BN_LLONG
       
   299 EXPORT_C BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
       
   300         {
       
   301 	BN_ULLONG ll=0;
       
   302 
       
   303 	assert(n >= 0);
       
   304 	if (n <= 0) return((BN_ULONG)0);
       
   305 
       
   306 	for (;;)
       
   307 		{
       
   308 		ll+=(BN_ULLONG)a[0]+b[0];
       
   309 		r[0]=(BN_ULONG)ll&BN_MASK2;
       
   310 		ll>>=BN_BITS2;
       
   311 		if (--n <= 0) break;
       
   312 
       
   313 		ll+=(BN_ULLONG)a[1]+b[1];
       
   314 		r[1]=(BN_ULONG)ll&BN_MASK2;
       
   315 		ll>>=BN_BITS2;
       
   316 		if (--n <= 0) break;
       
   317 
       
   318 		ll+=(BN_ULLONG)a[2]+b[2];
       
   319 		r[2]=(BN_ULONG)ll&BN_MASK2;
       
   320 		ll>>=BN_BITS2;
       
   321 		if (--n <= 0) break;
       
   322 
       
   323 		ll+=(BN_ULLONG)a[3]+b[3];
       
   324 		r[3]=(BN_ULONG)ll&BN_MASK2;
       
   325 		ll>>=BN_BITS2;
       
   326 		if (--n <= 0) break;
       
   327 
       
   328 		a+=4;
       
   329 		b+=4;
       
   330 		r+=4;
       
   331 		}
       
   332 	return((BN_ULONG)ll);
       
   333 	}
       
   334 #else /* !BN_LLONG */
       
   335 EXPORT_C BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
       
   336         {
       
   337 	BN_ULONG c,l,t;
       
   338 
       
   339 	assert(n >= 0);
       
   340 	if (n <= 0) return((BN_ULONG)0);
       
   341 
       
   342 	c=0;
       
   343 	for (;;)
       
   344 		{
       
   345 		t=a[0];
       
   346 		t=(t+c)&BN_MASK2;
       
   347 		c=(t < c);
       
   348 		l=(t+b[0])&BN_MASK2;
       
   349 		c+=(l < t);
       
   350 		r[0]=l;
       
   351 		if (--n <= 0) break;
       
   352 
       
   353 		t=a[1];
       
   354 		t=(t+c)&BN_MASK2;
       
   355 		c=(t < c);
       
   356 		l=(t+b[1])&BN_MASK2;
       
   357 		c+=(l < t);
       
   358 		r[1]=l;
       
   359 		if (--n <= 0) break;
       
   360 
       
   361 		t=a[2];
       
   362 		t=(t+c)&BN_MASK2;
       
   363 		c=(t < c);
       
   364 		l=(t+b[2])&BN_MASK2;
       
   365 		c+=(l < t);
       
   366 		r[2]=l;
       
   367 		if (--n <= 0) break;
       
   368 
       
   369 		t=a[3];
       
   370 		t=(t+c)&BN_MASK2;
       
   371 		c=(t < c);
       
   372 		l=(t+b[3])&BN_MASK2;
       
   373 		c+=(l < t);
       
   374 		r[3]=l;
       
   375 		if (--n <= 0) break;
       
   376 
       
   377 		a+=4;
       
   378 		b+=4;
       
   379 		r+=4;
       
   380 		}
       
   381 	return((BN_ULONG)c);
       
   382 	}
       
   383 #endif /* !BN_LLONG */
       
   384 
       
   385 EXPORT_C BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
       
   386         {
       
   387 	BN_ULONG t1,t2;
       
   388 	int c=0;
       
   389 
       
   390 	assert(n >= 0);
       
   391 	if (n <= 0) return((BN_ULONG)0);
       
   392 
       
   393 	for (;;)
       
   394 		{
       
   395 		t1=a[0]; t2=b[0];
       
   396 		r[0]=(t1-t2-c)&BN_MASK2;
       
   397 		if (t1 != t2) c=(t1 < t2);
       
   398 		if (--n <= 0) break;
       
   399 
       
   400 		t1=a[1]; t2=b[1];
       
   401 		r[1]=(t1-t2-c)&BN_MASK2;
       
   402 		if (t1 != t2) c=(t1 < t2);
       
   403 		if (--n <= 0) break;
       
   404 
       
   405 		t1=a[2]; t2=b[2];
       
   406 		r[2]=(t1-t2-c)&BN_MASK2;
       
   407 		if (t1 != t2) c=(t1 < t2);
       
   408 		if (--n <= 0) break;
       
   409 
       
   410 		t1=a[3]; t2=b[3];
       
   411 		r[3]=(t1-t2-c)&BN_MASK2;
       
   412 		if (t1 != t2) c=(t1 < t2);
       
   413 		if (--n <= 0) break;
       
   414 
       
   415 		a+=4;
       
   416 		b+=4;
       
   417 		r+=4;
       
   418 		}
       
   419 	return(c);
       
   420 	}
       
   421 
       
   422 #ifdef BN_MUL_COMBA
       
   423 
       
   424 #undef bn_mul_comba8
       
   425 #undef bn_mul_comba4
       
   426 #undef bn_sqr_comba8
       
   427 #undef bn_sqr_comba4
       
   428 
       
   429 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
       
   430 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
       
   431 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
       
   432 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
       
   433 
       
   434 #ifdef BN_LLONG
       
   435 #define mul_add_c(a,b,c0,c1,c2) \
       
   436 	t=(BN_ULLONG)a*b; \
       
   437 	t1=(BN_ULONG)Lw(t); \
       
   438 	t2=(BN_ULONG)Hw(t); \
       
   439 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
       
   440 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
       
   441 
       
   442 #define mul_add_c2(a,b,c0,c1,c2) \
       
   443 	t=(BN_ULLONG)a*b; \
       
   444 	tt=(t+t)&BN_MASK; \
       
   445 	if (tt < t) c2++; \
       
   446 	t1=(BN_ULONG)Lw(tt); \
       
   447 	t2=(BN_ULONG)Hw(tt); \
       
   448 	c0=(c0+t1)&BN_MASK2;  \
       
   449 	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
       
   450 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
       
   451 
       
   452 #define sqr_add_c(a,i,c0,c1,c2) \
       
   453 	t=(BN_ULLONG)a[i]*a[i]; \
       
   454 	t1=(BN_ULONG)Lw(t); \
       
   455 	t2=(BN_ULONG)Hw(t); \
       
   456 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
       
   457 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
       
   458 
       
   459 #define sqr_add_c2(a,i,j,c0,c1,c2) \
       
   460 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
       
   461 
       
   462 #elif defined(BN_UMULT_LOHI)
       
   463 
       
   464 #define mul_add_c(a,b,c0,c1,c2)	{	\
       
   465 	BN_ULONG ta=(a),tb=(b);		\
       
   466 	BN_UMULT_LOHI(t1,t2,ta,tb);	\
       
   467 	c0 += t1; t2 += (c0<t1)?1:0;	\
       
   468 	c1 += t2; c2 += (c1<t2)?1:0;	\
       
   469 	}
       
   470 
       
   471 #define mul_add_c2(a,b,c0,c1,c2) {	\
       
   472 	BN_ULONG ta=(a),tb=(b),t0;	\
       
   473 	BN_UMULT_LOHI(t0,t1,ta,tb);	\
       
   474 	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
       
   475 	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
       
   476 	c0 += t1; t2 += (c0<t1)?1:0;	\
       
   477 	c1 += t2; c2 += (c1<t2)?1:0;	\
       
   478 	}
       
   479 
       
   480 #define sqr_add_c(a,i,c0,c1,c2)	{	\
       
   481 	BN_ULONG ta=(a)[i];		\
       
   482 	BN_UMULT_LOHI(t1,t2,ta,ta);	\
       
   483 	c0 += t1; t2 += (c0<t1)?1:0;	\
       
   484 	c1 += t2; c2 += (c1<t2)?1:0;	\
       
   485 	}
       
   486 
       
   487 #define sqr_add_c2(a,i,j,c0,c1,c2)	\
       
   488 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
       
   489 
       
   490 #elif defined(BN_UMULT_HIGH)
       
   491 
       
   492 #define mul_add_c(a,b,c0,c1,c2)	{	\
       
   493 	BN_ULONG ta=(a),tb=(b);		\
       
   494 	t1 = ta * tb;			\
       
   495 	t2 = BN_UMULT_HIGH(ta,tb);	\
       
   496 	c0 += t1; t2 += (c0<t1)?1:0;	\
       
   497 	c1 += t2; c2 += (c1<t2)?1:0;	\
       
   498 	}
       
   499 
       
   500 #define mul_add_c2(a,b,c0,c1,c2) {	\
       
   501 	BN_ULONG ta=(a),tb=(b),t0;	\
       
   502 	t1 = BN_UMULT_HIGH(ta,tb);	\
       
   503 	t0 = ta * tb;			\
       
   504 	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
       
   505 	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
       
   506 	c0 += t1; t2 += (c0<t1)?1:0;	\
       
   507 	c1 += t2; c2 += (c1<t2)?1:0;	\
       
   508 	}
       
   509 
       
   510 #define sqr_add_c(a,i,c0,c1,c2)	{	\
       
   511 	BN_ULONG ta=(a)[i];		\
       
   512 	t1 = ta * ta;			\
       
   513 	t2 = BN_UMULT_HIGH(ta,ta);	\
       
   514 	c0 += t1; t2 += (c0<t1)?1:0;	\
       
   515 	c1 += t2; c2 += (c1<t2)?1:0;	\
       
   516 	}
       
   517 
       
   518 #define sqr_add_c2(a,i,j,c0,c1,c2)	\
       
   519 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
       
   520 
       
   521 #else /* !BN_LLONG */
       
   522 #define mul_add_c(a,b,c0,c1,c2) \
       
   523 	t1=LBITS(a); t2=HBITS(a); \
       
   524 	bl=LBITS(b); bh=HBITS(b); \
       
   525 	mul64(t1,t2,bl,bh); \
       
   526 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
       
   527 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
       
   528 
       
   529 #define mul_add_c2(a,b,c0,c1,c2) \
       
   530 	t1=LBITS(a); t2=HBITS(a); \
       
   531 	bl=LBITS(b); bh=HBITS(b); \
       
   532 	mul64(t1,t2,bl,bh); \
       
   533 	if (t2 & BN_TBIT) c2++; \
       
   534 	t2=(t2+t2)&BN_MASK2; \
       
   535 	if (t1 & BN_TBIT) t2++; \
       
   536 	t1=(t1+t1)&BN_MASK2; \
       
   537 	c0=(c0+t1)&BN_MASK2;  \
       
   538 	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
       
   539 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
       
   540 
       
   541 #define sqr_add_c(a,i,c0,c1,c2) \
       
   542 	sqr64(t1,t2,(a)[i]); \
       
   543 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
       
   544 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
       
   545 
       
   546 #define sqr_add_c2(a,i,j,c0,c1,c2) \
       
   547 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
       
   548 #endif /* !BN_LLONG */
       
   549 
       
   550 EXPORT_C void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
       
   551 	{
       
   552 #ifdef BN_LLONG
       
   553 	BN_ULLONG t;
       
   554 #else
       
   555 	BN_ULONG bl,bh;
       
   556 #endif
       
   557 	BN_ULONG t1,t2;
       
   558 	BN_ULONG c1,c2,c3;
       
   559 
       
   560 	c1=0;
       
   561 	c2=0;
       
   562 	c3=0;
       
   563 	mul_add_c(a[0],b[0],c1,c2,c3);
       
   564 	r[0]=c1;
       
   565 	c1=0;
       
   566 	mul_add_c(a[0],b[1],c2,c3,c1);
       
   567 	mul_add_c(a[1],b[0],c2,c3,c1);
       
   568 	r[1]=c2;
       
   569 	c2=0;
       
   570 	mul_add_c(a[2],b[0],c3,c1,c2);
       
   571 	mul_add_c(a[1],b[1],c3,c1,c2);
       
   572 	mul_add_c(a[0],b[2],c3,c1,c2);
       
   573 	r[2]=c3;
       
   574 	c3=0;
       
   575 	mul_add_c(a[0],b[3],c1,c2,c3);
       
   576 	mul_add_c(a[1],b[2],c1,c2,c3);
       
   577 	mul_add_c(a[2],b[1],c1,c2,c3);
       
   578 	mul_add_c(a[3],b[0],c1,c2,c3);
       
   579 	r[3]=c1;
       
   580 	c1=0;
       
   581 	mul_add_c(a[4],b[0],c2,c3,c1);
       
   582 	mul_add_c(a[3],b[1],c2,c3,c1);
       
   583 	mul_add_c(a[2],b[2],c2,c3,c1);
       
   584 	mul_add_c(a[1],b[3],c2,c3,c1);
       
   585 	mul_add_c(a[0],b[4],c2,c3,c1);
       
   586 	r[4]=c2;
       
   587 	c2=0;
       
   588 	mul_add_c(a[0],b[5],c3,c1,c2);
       
   589 	mul_add_c(a[1],b[4],c3,c1,c2);
       
   590 	mul_add_c(a[2],b[3],c3,c1,c2);
       
   591 	mul_add_c(a[3],b[2],c3,c1,c2);
       
   592 	mul_add_c(a[4],b[1],c3,c1,c2);
       
   593 	mul_add_c(a[5],b[0],c3,c1,c2);
       
   594 	r[5]=c3;
       
   595 	c3=0;
       
   596 	mul_add_c(a[6],b[0],c1,c2,c3);
       
   597 	mul_add_c(a[5],b[1],c1,c2,c3);
       
   598 	mul_add_c(a[4],b[2],c1,c2,c3);
       
   599 	mul_add_c(a[3],b[3],c1,c2,c3);
       
   600 	mul_add_c(a[2],b[4],c1,c2,c3);
       
   601 	mul_add_c(a[1],b[5],c1,c2,c3);
       
   602 	mul_add_c(a[0],b[6],c1,c2,c3);
       
   603 	r[6]=c1;
       
   604 	c1=0;
       
   605 	mul_add_c(a[0],b[7],c2,c3,c1);
       
   606 	mul_add_c(a[1],b[6],c2,c3,c1);
       
   607 	mul_add_c(a[2],b[5],c2,c3,c1);
       
   608 	mul_add_c(a[3],b[4],c2,c3,c1);
       
   609 	mul_add_c(a[4],b[3],c2,c3,c1);
       
   610 	mul_add_c(a[5],b[2],c2,c3,c1);
       
   611 	mul_add_c(a[6],b[1],c2,c3,c1);
       
   612 	mul_add_c(a[7],b[0],c2,c3,c1);
       
   613 	r[7]=c2;
       
   614 	c2=0;
       
   615 	mul_add_c(a[7],b[1],c3,c1,c2);
       
   616 	mul_add_c(a[6],b[2],c3,c1,c2);
       
   617 	mul_add_c(a[5],b[3],c3,c1,c2);
       
   618 	mul_add_c(a[4],b[4],c3,c1,c2);
       
   619 	mul_add_c(a[3],b[5],c3,c1,c2);
       
   620 	mul_add_c(a[2],b[6],c3,c1,c2);
       
   621 	mul_add_c(a[1],b[7],c3,c1,c2);
       
   622 	r[8]=c3;
       
   623 	c3=0;
       
   624 	mul_add_c(a[2],b[7],c1,c2,c3);
       
   625 	mul_add_c(a[3],b[6],c1,c2,c3);
       
   626 	mul_add_c(a[4],b[5],c1,c2,c3);
       
   627 	mul_add_c(a[5],b[4],c1,c2,c3);
       
   628 	mul_add_c(a[6],b[3],c1,c2,c3);
       
   629 	mul_add_c(a[7],b[2],c1,c2,c3);
       
   630 	r[9]=c1;
       
   631 	c1=0;
       
   632 	mul_add_c(a[7],b[3],c2,c3,c1);
       
   633 	mul_add_c(a[6],b[4],c2,c3,c1);
       
   634 	mul_add_c(a[5],b[5],c2,c3,c1);
       
   635 	mul_add_c(a[4],b[6],c2,c3,c1);
       
   636 	mul_add_c(a[3],b[7],c2,c3,c1);
       
   637 	r[10]=c2;
       
   638 	c2=0;
       
   639 	mul_add_c(a[4],b[7],c3,c1,c2);
       
   640 	mul_add_c(a[5],b[6],c3,c1,c2);
       
   641 	mul_add_c(a[6],b[5],c3,c1,c2);
       
   642 	mul_add_c(a[7],b[4],c3,c1,c2);
       
   643 	r[11]=c3;
       
   644 	c3=0;
       
   645 	mul_add_c(a[7],b[5],c1,c2,c3);
       
   646 	mul_add_c(a[6],b[6],c1,c2,c3);
       
   647 	mul_add_c(a[5],b[7],c1,c2,c3);
       
   648 	r[12]=c1;
       
   649 	c1=0;
       
   650 	mul_add_c(a[6],b[7],c2,c3,c1);
       
   651 	mul_add_c(a[7],b[6],c2,c3,c1);
       
   652 	r[13]=c2;
       
   653 	c2=0;
       
   654 	mul_add_c(a[7],b[7],c3,c1,c2);
       
   655 	r[14]=c3;
       
   656 	r[15]=c1;
       
   657 	}
       
   658 
       
   659 EXPORT_C void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
       
   660 	{
       
   661 #ifdef BN_LLONG
       
   662 	BN_ULLONG t;
       
   663 #else
       
   664 	BN_ULONG bl,bh;
       
   665 #endif
       
   666 	BN_ULONG t1,t2;
       
   667 	BN_ULONG c1,c2,c3;
       
   668 
       
   669 	c1=0;
       
   670 	c2=0;
       
   671 	c3=0;
       
   672 	mul_add_c(a[0],b[0],c1,c2,c3);
       
   673 	r[0]=c1;
       
   674 	c1=0;
       
   675 	mul_add_c(a[0],b[1],c2,c3,c1);
       
   676 	mul_add_c(a[1],b[0],c2,c3,c1);
       
   677 	r[1]=c2;
       
   678 	c2=0;
       
   679 	mul_add_c(a[2],b[0],c3,c1,c2);
       
   680 	mul_add_c(a[1],b[1],c3,c1,c2);
       
   681 	mul_add_c(a[0],b[2],c3,c1,c2);
       
   682 	r[2]=c3;
       
   683 	c3=0;
       
   684 	mul_add_c(a[0],b[3],c1,c2,c3);
       
   685 	mul_add_c(a[1],b[2],c1,c2,c3);
       
   686 	mul_add_c(a[2],b[1],c1,c2,c3);
       
   687 	mul_add_c(a[3],b[0],c1,c2,c3);
       
   688 	r[3]=c1;
       
   689 	c1=0;
       
   690 	mul_add_c(a[3],b[1],c2,c3,c1);
       
   691 	mul_add_c(a[2],b[2],c2,c3,c1);
       
   692 	mul_add_c(a[1],b[3],c2,c3,c1);
       
   693 	r[4]=c2;
       
   694 	c2=0;
       
   695 	mul_add_c(a[2],b[3],c3,c1,c2);
       
   696 	mul_add_c(a[3],b[2],c3,c1,c2);
       
   697 	r[5]=c3;
       
   698 	c3=0;
       
   699 	mul_add_c(a[3],b[3],c1,c2,c3);
       
   700 	r[6]=c1;
       
   701 	r[7]=c2;
       
   702 	}
       
   703 
       
   704 EXPORT_C void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
       
   705 	{
       
   706 #ifdef BN_LLONG
       
   707 	BN_ULLONG t,tt;
       
   708 #else
       
   709 	BN_ULONG bl,bh;
       
   710 #endif
       
   711 	BN_ULONG t1,t2;
       
   712 	BN_ULONG c1,c2,c3;
       
   713 
       
   714 	c1=0;
       
   715 	c2=0;
       
   716 	c3=0;
       
   717 	sqr_add_c(a,0,c1,c2,c3);
       
   718 	r[0]=c1;
       
   719 	c1=0;
       
   720 	sqr_add_c2(a,1,0,c2,c3,c1);
       
   721 	r[1]=c2;
       
   722 	c2=0;
       
   723 	sqr_add_c(a,1,c3,c1,c2);
       
   724 	sqr_add_c2(a,2,0,c3,c1,c2);
       
   725 	r[2]=c3;
       
   726 	c3=0;
       
   727 	sqr_add_c2(a,3,0,c1,c2,c3);
       
   728 	sqr_add_c2(a,2,1,c1,c2,c3);
       
   729 	r[3]=c1;
       
   730 	c1=0;
       
   731 	sqr_add_c(a,2,c2,c3,c1);
       
   732 	sqr_add_c2(a,3,1,c2,c3,c1);
       
   733 	sqr_add_c2(a,4,0,c2,c3,c1);
       
   734 	r[4]=c2;
       
   735 	c2=0;
       
   736 	sqr_add_c2(a,5,0,c3,c1,c2);
       
   737 	sqr_add_c2(a,4,1,c3,c1,c2);
       
   738 	sqr_add_c2(a,3,2,c3,c1,c2);
       
   739 	r[5]=c3;
       
   740 	c3=0;
       
   741 	sqr_add_c(a,3,c1,c2,c3);
       
   742 	sqr_add_c2(a,4,2,c1,c2,c3);
       
   743 	sqr_add_c2(a,5,1,c1,c2,c3);
       
   744 	sqr_add_c2(a,6,0,c1,c2,c3);
       
   745 	r[6]=c1;
       
   746 	c1=0;
       
   747 	sqr_add_c2(a,7,0,c2,c3,c1);
       
   748 	sqr_add_c2(a,6,1,c2,c3,c1);
       
   749 	sqr_add_c2(a,5,2,c2,c3,c1);
       
   750 	sqr_add_c2(a,4,3,c2,c3,c1);
       
   751 	r[7]=c2;
       
   752 	c2=0;
       
   753 	sqr_add_c(a,4,c3,c1,c2);
       
   754 	sqr_add_c2(a,5,3,c3,c1,c2);
       
   755 	sqr_add_c2(a,6,2,c3,c1,c2);
       
   756 	sqr_add_c2(a,7,1,c3,c1,c2);
       
   757 	r[8]=c3;
       
   758 	c3=0;
       
   759 	sqr_add_c2(a,7,2,c1,c2,c3);
       
   760 	sqr_add_c2(a,6,3,c1,c2,c3);
       
   761 	sqr_add_c2(a,5,4,c1,c2,c3);
       
   762 	r[9]=c1;
       
   763 	c1=0;
       
   764 	sqr_add_c(a,5,c2,c3,c1);
       
   765 	sqr_add_c2(a,6,4,c2,c3,c1);
       
   766 	sqr_add_c2(a,7,3,c2,c3,c1);
       
   767 	r[10]=c2;
       
   768 	c2=0;
       
   769 	sqr_add_c2(a,7,4,c3,c1,c2);
       
   770 	sqr_add_c2(a,6,5,c3,c1,c2);
       
   771 	r[11]=c3;
       
   772 	c3=0;
       
   773 	sqr_add_c(a,6,c1,c2,c3);
       
   774 	sqr_add_c2(a,7,5,c1,c2,c3);
       
   775 	r[12]=c1;
       
   776 	c1=0;
       
   777 	sqr_add_c2(a,7,6,c2,c3,c1);
       
   778 	r[13]=c2;
       
   779 	c2=0;
       
   780 	sqr_add_c(a,7,c3,c1,c2);
       
   781 	r[14]=c3;
       
   782 	r[15]=c1;
       
   783 	}
       
   784 
       
   785 EXPORT_C void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
       
   786 	{
       
   787 #ifdef BN_LLONG
       
   788 	BN_ULLONG t,tt;
       
   789 #else
       
   790 	BN_ULONG bl,bh;
       
   791 #endif
       
   792 	BN_ULONG t1,t2;
       
   793 	BN_ULONG c1,c2,c3;
       
   794 
       
   795 	c1=0;
       
   796 	c2=0;
       
   797 	c3=0;
       
   798 	sqr_add_c(a,0,c1,c2,c3);
       
   799 	r[0]=c1;
       
   800 	c1=0;
       
   801 	sqr_add_c2(a,1,0,c2,c3,c1);
       
   802 	r[1]=c2;
       
   803 	c2=0;
       
   804 	sqr_add_c(a,1,c3,c1,c2);
       
   805 	sqr_add_c2(a,2,0,c3,c1,c2);
       
   806 	r[2]=c3;
       
   807 	c3=0;
       
   808 	sqr_add_c2(a,3,0,c1,c2,c3);
       
   809 	sqr_add_c2(a,2,1,c1,c2,c3);
       
   810 	r[3]=c1;
       
   811 	c1=0;
       
   812 	sqr_add_c(a,2,c2,c3,c1);
       
   813 	sqr_add_c2(a,3,1,c2,c3,c1);
       
   814 	r[4]=c2;
       
   815 	c2=0;
       
   816 	sqr_add_c2(a,3,2,c3,c1,c2);
       
   817 	r[5]=c3;
       
   818 	c3=0;
       
   819 	sqr_add_c(a,3,c1,c2,c3);
       
   820 	r[6]=c1;
       
   821 	r[7]=c2;
       
   822 	}
       
   823 #else /* !BN_MUL_COMBA */
       
   824 
       
   825 /* hmm... is it faster just to do a multiply? */
       
   826 #undef bn_sqr_comba4
       
   827 EXPORT_C void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
       
   828 	{
       
   829 	BN_ULONG t[8];
       
   830 	bn_sqr_normal(r,a,4,t);
       
   831 	}
       
   832 
       
   833 #undef bn_sqr_comba8
       
   834 EXPORT_C void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
       
   835 	{
       
   836 	BN_ULONG t[16];
       
   837 	bn_sqr_normal(r,a,8,t);
       
   838 	}
       
   839 
       
   840 EXPORT_C void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
       
   841 	{
       
   842 	r[4]=bn_mul_words(    &(r[0]),a,4,b[0]);
       
   843 	r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]);
       
   844 	r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]);
       
   845 	r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]);
       
   846 	}
       
   847 
       
   848 EXPORT_C void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
       
   849 	{
       
   850 	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
       
   851 	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
       
   852 	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
       
   853 	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
       
   854 	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
       
   855 	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
       
   856 	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
       
   857 	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
       
   858 	}
       
   859 
       
   860 #endif /* !BN_MUL_COMBA */