|
1 /* |
|
2 * Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of the License "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 #include "words.h" |
|
20 #include "algorithms.h" |
|
21 |
|
22 word Add(word *C, const word *A, const word *B, unsigned int N) |
|
23 { |
|
24 assert (N%2 == 0); |
|
25 word carry = 0; |
|
26 for (unsigned int i = 0; i < N; i+=2) |
|
27 { |
|
28 dword u = (dword) carry + A[i] + B[i]; |
|
29 C[i] = LOW_WORD(u); |
|
30 u = (dword) HIGH_WORD(u) + A[i+1] + B[i+1]; |
|
31 C[i+1] = LOW_WORD(u); |
|
32 carry = HIGH_WORD(u); |
|
33 } |
|
34 return carry; |
|
35 } |
|
36 |
|
37 word Subtract(word *C, const word *A, const word *B, unsigned int N) |
|
38 { |
|
39 assert (N%2 == 0); |
|
40 word borrow=0; |
|
41 for (unsigned i = 0; i < N; i+=2) |
|
42 { |
|
43 dword u = (dword) A[i] - B[i] - borrow; |
|
44 C[i] = LOW_WORD(u); |
|
45 u = (dword) A[i+1] - B[i+1] - (word)(0-HIGH_WORD(u)); |
|
46 C[i+1] = LOW_WORD(u); |
|
47 borrow = 0-HIGH_WORD(u); |
|
48 } |
|
49 return borrow; |
|
50 } |
|
51 |
|
52 int Compare(const word *A, const word *B, unsigned int N) |
|
53 { |
|
54 while (N--) |
|
55 if (A[N] > B[N]) |
|
56 return 1; |
|
57 else if (A[N] < B[N]) |
|
58 return -1; |
|
59 |
|
60 return 0; |
|
61 } |
|
62 |
|
63 // It is the job of the calling code to ensure that this won't carry. |
|
64 // If you aren't sure, use the next version that will tell you if you need to |
|
65 // grow your integer. |
|
66 // Having two of these creates ever so slightly more code but avoids having |
|
67 // ifdefs all over the rest of the code checking the following type stuff which |
|
68 // causes warnings in certain compilers about unused parameters in release |
|
69 // builds. We can't have that can we! |
|
70 /* |
|
71 Allows avoid this all over bigint.cpp and primes.cpp |
|
72 ifdef _DEBUG |
|
73 TUint carry = Increment(Ptr(), Size()); |
|
74 assert(!carry); |
|
75 else |
|
76 Increment(Ptr(), Size()) |
|
77 endif |
|
78 */ |
|
79 void IncrementNoCarry(word *A, unsigned int N, word B) |
|
80 { |
|
81 assert(N); |
|
82 word t = A[0]; |
|
83 A[0] = t+B; |
|
84 if (A[0] >= t) |
|
85 return; |
|
86 for (unsigned i=1; i<N; i++) |
|
87 if (++A[i]) |
|
88 return; |
|
89 assert(0); |
|
90 } |
|
91 |
|
92 word Increment(word *A, unsigned int N, word B) |
|
93 { |
|
94 assert(N); |
|
95 word t = A[0]; |
|
96 A[0] = t+B; |
|
97 if (A[0] >= t) |
|
98 return 0; |
|
99 for (unsigned i=1; i<N; i++) |
|
100 if (++A[i]) |
|
101 return 0; |
|
102 return 1; |
|
103 } |
|
104 |
|
105 //See commments above about IncrementNoCarry |
|
106 void DecrementNoCarry(word *A, unsigned int N, word B) |
|
107 { |
|
108 assert(N); |
|
109 word t = A[0]; |
|
110 A[0] = t-B; |
|
111 if (A[0] <= t) |
|
112 return; |
|
113 for (unsigned i=1; i<N; i++) |
|
114 if (A[i]--) |
|
115 return; |
|
116 assert(0); |
|
117 } |
|
118 |
|
119 word Decrement(word *A, unsigned int N, word B) |
|
120 { |
|
121 assert(N); |
|
122 word t = A[0]; |
|
123 A[0] = t-B; |
|
124 if (A[0] <= t) |
|
125 return 0; |
|
126 for (unsigned i=1; i<N; i++) |
|
127 if (A[i]--) |
|
128 return 0; |
|
129 return 1; |
|
130 } |
|
131 |
|
132 void TwosComplement(word *A, unsigned int N) |
|
133 { |
|
134 Decrement(A, N); |
|
135 for (unsigned i=0; i<N; i++) |
|
136 A[i] = ~A[i]; |
|
137 } |
|
138 |
|
139 static word LinearMultiply(word *C, const word *A, word B, unsigned int N) |
|
140 { |
|
141 word carry=0; |
|
142 for(unsigned i=0; i<N; i++) |
|
143 { |
|
144 dword p = (dword)A[i] * B + carry; |
|
145 C[i] = LOW_WORD(p); |
|
146 carry = HIGH_WORD(p); |
|
147 } |
|
148 return carry; |
|
149 } |
|
150 |
|
151 static void AtomicMultiply(word *C, const word *A, const word *B) |
|
152 { |
|
153 /* |
|
154 word s; |
|
155 dword d; |
|
156 |
|
157 if (A1 >= A0) |
|
158 if (B0 >= B1) |
|
159 { |
|
160 s = 0; |
|
161 d = (dword)(A1-A0)*(B0-B1); |
|
162 } |
|
163 else |
|
164 { |
|
165 s = (A1-A0); |
|
166 d = (dword)s*(word)(B0-B1); |
|
167 } |
|
168 else |
|
169 if (B0 > B1) |
|
170 { |
|
171 s = (B0-B1); |
|
172 d = (word)(A1-A0)*(dword)s; |
|
173 } |
|
174 else |
|
175 { |
|
176 s = 0; |
|
177 d = (dword)(A0-A1)*(B1-B0); |
|
178 } |
|
179 */ |
|
180 // this segment is the branchless equivalent of above |
|
181 word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; |
|
182 unsigned int ai = A[1] < A[0]; |
|
183 unsigned int bi = B[0] < B[1]; |
|
184 unsigned int di = ai & bi; |
|
185 dword d = (dword)D[di]*D[di+2]; |
|
186 D[1] = D[3] = 0; |
|
187 unsigned int si = ai + !bi; |
|
188 word s = D[si]; |
|
189 |
|
190 dword A0B0 = (dword)A[0]*B[0]; |
|
191 C[0] = LOW_WORD(A0B0); |
|
192 |
|
193 dword A1B1 = (dword)A[1]*B[1]; |
|
194 dword t = (dword) HIGH_WORD(A0B0) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1); |
|
195 C[1] = LOW_WORD(t); |
|
196 |
|
197 t = A1B1 + HIGH_WORD(t) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s; |
|
198 C[2] = LOW_WORD(t); |
|
199 C[3] = HIGH_WORD(t); |
|
200 } |
|
201 |
|
202 static word AtomicMultiplyAdd(word *C, const word *A, const word *B) |
|
203 { |
|
204 word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; |
|
205 unsigned int ai = A[1] < A[0]; |
|
206 unsigned int bi = B[0] < B[1]; |
|
207 unsigned int di = ai & bi; |
|
208 dword d = (dword)D[di]*D[di+2]; |
|
209 D[1] = D[3] = 0; |
|
210 unsigned int si = ai + !bi; |
|
211 word s = D[si]; |
|
212 |
|
213 dword A0B0 = (dword)A[0]*B[0]; |
|
214 dword t = A0B0 + C[0]; |
|
215 C[0] = LOW_WORD(t); |
|
216 |
|
217 dword A1B1 = (dword)A[1]*B[1]; |
|
218 t = (dword) HIGH_WORD(t) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1) + C[1]; |
|
219 C[1] = LOW_WORD(t); |
|
220 |
|
221 t = (dword) HIGH_WORD(t) + LOW_WORD(A1B1) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s + C[2]; |
|
222 C[2] = LOW_WORD(t); |
|
223 |
|
224 t = (dword) HIGH_WORD(t) + HIGH_WORD(A1B1) + C[3]; |
|
225 C[3] = LOW_WORD(t); |
|
226 return HIGH_WORD(t); |
|
227 } |
|
228 |
|
229 static inline void AtomicMultiplyBottom(word *C, const word *A, const word *B) |
|
230 { |
|
231 dword t = (dword)A[0]*B[0]; |
|
232 C[0] = LOW_WORD(t); |
|
233 C[1] = HIGH_WORD(t) + A[0]*B[1] + A[1]*B[0]; |
|
234 } |
|
235 |
|
236 #define MulAcc(x, y) \ |
|
237 p = (dword)A[x] * B[y] + c; \ |
|
238 c = LOW_WORD(p); \ |
|
239 p = (dword)d + HIGH_WORD(p); \ |
|
240 d = LOW_WORD(p); \ |
|
241 e += HIGH_WORD(p); |
|
242 |
|
243 #define SaveMulAcc(s, x, y) \ |
|
244 R[s] = c; \ |
|
245 p = (dword)A[x] * B[y] + d; \ |
|
246 c = LOW_WORD(p); \ |
|
247 p = (dword)e + HIGH_WORD(p); \ |
|
248 d = LOW_WORD(p); \ |
|
249 e = HIGH_WORD(p); |
|
250 |
|
251 #define MulAcc1(x, y) \ |
|
252 p = (dword)A[x] * A[y] + c; \ |
|
253 c = LOW_WORD(p); \ |
|
254 p = (dword)d + HIGH_WORD(p); \ |
|
255 d = LOW_WORD(p); \ |
|
256 e += HIGH_WORD(p); |
|
257 |
|
258 #define SaveMulAcc1(s, x, y) \ |
|
259 R[s] = c; \ |
|
260 p = (dword)A[x] * A[y] + d; \ |
|
261 c = LOW_WORD(p); \ |
|
262 p = (dword)e + HIGH_WORD(p); \ |
|
263 d = LOW_WORD(p); \ |
|
264 e = HIGH_WORD(p); |
|
265 |
|
266 #define SquAcc(x, y) \ |
|
267 p = (dword)A[x] * A[y]; \ |
|
268 p = p + p + c; \ |
|
269 c = LOW_WORD(p); \ |
|
270 p = (dword)d + HIGH_WORD(p); \ |
|
271 d = LOW_WORD(p); \ |
|
272 e += HIGH_WORD(p); |
|
273 |
|
274 #define SaveSquAcc(s, x, y) \ |
|
275 R[s] = c; \ |
|
276 p = (dword)A[x] * A[y]; \ |
|
277 p = p + p + d; \ |
|
278 c = LOW_WORD(p); \ |
|
279 p = (dword)e + HIGH_WORD(p); \ |
|
280 d = LOW_WORD(p); \ |
|
281 e = HIGH_WORD(p); |
|
282 |
|
283 // VC60 workaround: MSVC 6.0 has an optimization problem that makes |
|
284 // (dword)A*B where either A or B has been cast to a dword before |
|
285 // very expensive. Revisit a CombaSquare4() function when this |
|
286 // problem is fixed. |
|
287 |
|
288 // WARNING: KeithR. 05/08/03 This routine doesn't work with gcc on hardware |
|
289 // either. I've completely removed it. It may be worth looking into sometime |
|
290 // in the future. |
|
291 /*#ifndef __WINS__ |
|
292 static void CombaSquare4(word *R, const word *A) |
|
293 { |
|
294 dword p; |
|
295 word c, d, e; |
|
296 |
|
297 p = (dword)A[0] * A[0]; |
|
298 R[0] = LOW_WORD(p); |
|
299 c = HIGH_WORD(p); |
|
300 d = e = 0; |
|
301 |
|
302 SquAcc(0, 1); |
|
303 |
|
304 SaveSquAcc(1, 2, 0); |
|
305 MulAcc1(1, 1); |
|
306 |
|
307 SaveSquAcc(2, 0, 3); |
|
308 SquAcc(1, 2); |
|
309 |
|
310 SaveSquAcc(3, 3, 1); |
|
311 MulAcc1(2, 2); |
|
312 |
|
313 SaveSquAcc(4, 2, 3); |
|
314 |
|
315 R[5] = c; |
|
316 p = (dword)A[3] * A[3] + d; |
|
317 R[6] = LOW_WORD(p); |
|
318 R[7] = e + HIGH_WORD(p); |
|
319 } |
|
320 #endif */ |
|
321 |
|
322 static void CombaMultiply4(word *R, const word *A, const word *B) |
|
323 { |
|
324 dword p; |
|
325 word c, d, e; |
|
326 |
|
327 p = (dword)A[0] * B[0]; |
|
328 R[0] = LOW_WORD(p); |
|
329 c = HIGH_WORD(p); |
|
330 d = e = 0; |
|
331 |
|
332 MulAcc(0, 1); |
|
333 MulAcc(1, 0); |
|
334 |
|
335 SaveMulAcc(1, 2, 0); |
|
336 MulAcc(1, 1); |
|
337 MulAcc(0, 2); |
|
338 |
|
339 SaveMulAcc(2, 0, 3); |
|
340 MulAcc(1, 2); |
|
341 MulAcc(2, 1); |
|
342 MulAcc(3, 0); |
|
343 |
|
344 SaveMulAcc(3, 3, 1); |
|
345 MulAcc(2, 2); |
|
346 MulAcc(1, 3); |
|
347 |
|
348 SaveMulAcc(4, 2, 3); |
|
349 MulAcc(3, 2); |
|
350 |
|
351 R[5] = c; |
|
352 p = (dword)A[3] * B[3] + d; |
|
353 R[6] = LOW_WORD(p); |
|
354 R[7] = e + HIGH_WORD(p); |
|
355 } |
|
356 |
|
357 static void CombaMultiply8(word *R, const word *A, const word *B) |
|
358 { |
|
359 dword p; |
|
360 word c, d, e; |
|
361 |
|
362 p = (dword)A[0] * B[0]; |
|
363 R[0] = LOW_WORD(p); |
|
364 c = HIGH_WORD(p); |
|
365 d = e = 0; |
|
366 |
|
367 MulAcc(0, 1); |
|
368 MulAcc(1, 0); |
|
369 |
|
370 SaveMulAcc(1, 2, 0); |
|
371 MulAcc(1, 1); |
|
372 MulAcc(0, 2); |
|
373 |
|
374 SaveMulAcc(2, 0, 3); |
|
375 MulAcc(1, 2); |
|
376 MulAcc(2, 1); |
|
377 MulAcc(3, 0); |
|
378 |
|
379 SaveMulAcc(3, 0, 4); |
|
380 MulAcc(1, 3); |
|
381 MulAcc(2, 2); |
|
382 MulAcc(3, 1); |
|
383 MulAcc(4, 0); |
|
384 |
|
385 SaveMulAcc(4, 0, 5); |
|
386 MulAcc(1, 4); |
|
387 MulAcc(2, 3); |
|
388 MulAcc(3, 2); |
|
389 MulAcc(4, 1); |
|
390 MulAcc(5, 0); |
|
391 |
|
392 SaveMulAcc(5, 0, 6); |
|
393 MulAcc(1, 5); |
|
394 MulAcc(2, 4); |
|
395 MulAcc(3, 3); |
|
396 MulAcc(4, 2); |
|
397 MulAcc(5, 1); |
|
398 MulAcc(6, 0); |
|
399 |
|
400 SaveMulAcc(6, 0, 7); |
|
401 MulAcc(1, 6); |
|
402 MulAcc(2, 5); |
|
403 MulAcc(3, 4); |
|
404 MulAcc(4, 3); |
|
405 MulAcc(5, 2); |
|
406 MulAcc(6, 1); |
|
407 MulAcc(7, 0); |
|
408 |
|
409 SaveMulAcc(7, 1, 7); |
|
410 MulAcc(2, 6); |
|
411 MulAcc(3, 5); |
|
412 MulAcc(4, 4); |
|
413 MulAcc(5, 3); |
|
414 MulAcc(6, 2); |
|
415 MulAcc(7, 1); |
|
416 |
|
417 SaveMulAcc(8, 2, 7); |
|
418 MulAcc(3, 6); |
|
419 MulAcc(4, 5); |
|
420 MulAcc(5, 4); |
|
421 MulAcc(6, 3); |
|
422 MulAcc(7, 2); |
|
423 |
|
424 SaveMulAcc(9, 3, 7); |
|
425 MulAcc(4, 6); |
|
426 MulAcc(5, 5); |
|
427 MulAcc(6, 4); |
|
428 MulAcc(7, 3); |
|
429 |
|
430 SaveMulAcc(10, 4, 7); |
|
431 MulAcc(5, 6); |
|
432 MulAcc(6, 5); |
|
433 MulAcc(7, 4); |
|
434 |
|
435 SaveMulAcc(11, 5, 7); |
|
436 MulAcc(6, 6); |
|
437 MulAcc(7, 5); |
|
438 |
|
439 SaveMulAcc(12, 6, 7); |
|
440 MulAcc(7, 6); |
|
441 |
|
442 R[13] = c; |
|
443 p = (dword)A[7] * B[7] + d; |
|
444 R[14] = LOW_WORD(p); |
|
445 R[15] = e + HIGH_WORD(p); |
|
446 } |
|
447 |
|
448 static void CombaMultiplyBottom4(word *R, const word *A, const word *B) |
|
449 { |
|
450 dword p; |
|
451 word c, d, e; |
|
452 |
|
453 p = (dword)A[0] * B[0]; |
|
454 R[0] = LOW_WORD(p); |
|
455 c = HIGH_WORD(p); |
|
456 d = e = 0; |
|
457 |
|
458 MulAcc(0, 1); |
|
459 MulAcc(1, 0); |
|
460 |
|
461 SaveMulAcc(1, 2, 0); |
|
462 MulAcc(1, 1); |
|
463 MulAcc(0, 2); |
|
464 |
|
465 R[2] = c; |
|
466 R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0]; |
|
467 } |
|
468 |
|
469 static void CombaMultiplyBottom8(word *R, const word *A, const word *B) |
|
470 { |
|
471 dword p; |
|
472 word c, d, e; |
|
473 |
|
474 p = (dword)A[0] * B[0]; |
|
475 R[0] = LOW_WORD(p); |
|
476 c = HIGH_WORD(p); |
|
477 d = e = 0; |
|
478 |
|
479 MulAcc(0, 1); |
|
480 MulAcc(1, 0); |
|
481 |
|
482 SaveMulAcc(1, 2, 0); |
|
483 MulAcc(1, 1); |
|
484 MulAcc(0, 2); |
|
485 |
|
486 SaveMulAcc(2, 0, 3); |
|
487 MulAcc(1, 2); |
|
488 MulAcc(2, 1); |
|
489 MulAcc(3, 0); |
|
490 |
|
491 SaveMulAcc(3, 0, 4); |
|
492 MulAcc(1, 3); |
|
493 MulAcc(2, 2); |
|
494 MulAcc(3, 1); |
|
495 MulAcc(4, 0); |
|
496 |
|
497 SaveMulAcc(4, 0, 5); |
|
498 MulAcc(1, 4); |
|
499 MulAcc(2, 3); |
|
500 MulAcc(3, 2); |
|
501 MulAcc(4, 1); |
|
502 MulAcc(5, 0); |
|
503 |
|
504 SaveMulAcc(5, 0, 6); |
|
505 MulAcc(1, 5); |
|
506 MulAcc(2, 4); |
|
507 MulAcc(3, 3); |
|
508 MulAcc(4, 2); |
|
509 MulAcc(5, 1); |
|
510 MulAcc(6, 0); |
|
511 |
|
512 R[6] = c; |
|
513 R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] + |
|
514 A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0]; |
|
515 } |
|
516 |
|
517 #undef MulAcc |
|
518 #undef SaveMulAcc |
|
519 static void AtomicInverseModPower2(word *C, word A0, word A1) |
|
520 { |
|
521 assert(A0%2==1); |
|
522 |
|
523 dword A=MAKE_DWORD(A0, A1), R=A0%8; |
|
524 |
|
525 for (unsigned i=3; i<2*WORD_BITS; i*=2) |
|
526 R = R*(2-R*A); |
|
527 |
|
528 assert(R*A==1); |
|
529 |
|
530 C[0] = LOW_WORD(R); |
|
531 C[1] = HIGH_WORD(R); |
|
532 } |
|
533 // ******************************************************** |
|
534 |
|
535 #define A0 A |
|
536 #define A1 (A+N2) |
|
537 #define B0 B |
|
538 #define B1 (B+N2) |
|
539 |
|
540 #define T0 T |
|
541 #define T1 (T+N2) |
|
542 #define T2 (T+N) |
|
543 #define T3 (T+N+N2) |
|
544 |
|
545 #define R0 R |
|
546 #define R1 (R+N2) |
|
547 #define R2 (R+N) |
|
548 #define R3 (R+N+N2) |
|
549 |
|
550 // R[2*N] - result = A*B |
|
551 // T[2*N] - temporary work space |
|
552 // A[N] --- multiplier |
|
553 // B[N] --- multiplicant |
|
554 |
|
555 void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N) |
|
556 { |
|
557 assert(N>=2 && N%2==0); |
|
558 |
|
559 if (N==2) |
|
560 AtomicMultiply(R, A, B); |
|
561 else if (N==4) |
|
562 CombaMultiply4(R, A, B); |
|
563 else if (N==8) |
|
564 CombaMultiply8(R, A, B); |
|
565 else |
|
566 { |
|
567 const unsigned int N2 = N/2; |
|
568 int carry; |
|
569 |
|
570 int aComp = Compare(A0, A1, N2); |
|
571 int bComp = Compare(B0, B1, N2); |
|
572 |
|
573 switch (2*aComp + aComp + bComp) |
|
574 { |
|
575 case -4: |
|
576 Subtract(R0, A1, A0, N2); |
|
577 Subtract(R1, B0, B1, N2); |
|
578 RecursiveMultiply(T0, T2, R0, R1, N2); |
|
579 Subtract(T1, T1, R0, N2); |
|
580 carry = -1; |
|
581 break; |
|
582 case -2: |
|
583 Subtract(R0, A1, A0, N2); |
|
584 Subtract(R1, B0, B1, N2); |
|
585 RecursiveMultiply(T0, T2, R0, R1, N2); |
|
586 carry = 0; |
|
587 break; |
|
588 case 2: |
|
589 Subtract(R0, A0, A1, N2); |
|
590 Subtract(R1, B1, B0, N2); |
|
591 RecursiveMultiply(T0, T2, R0, R1, N2); |
|
592 carry = 0; |
|
593 break; |
|
594 case 4: |
|
595 Subtract(R0, A1, A0, N2); |
|
596 Subtract(R1, B0, B1, N2); |
|
597 RecursiveMultiply(T0, T2, R0, R1, N2); |
|
598 Subtract(T1, T1, R1, N2); |
|
599 carry = -1; |
|
600 break; |
|
601 default: |
|
602 SetWords(T0, 0, N); |
|
603 carry = 0; |
|
604 } |
|
605 |
|
606 RecursiveMultiply(R0, T2, A0, B0, N2); |
|
607 RecursiveMultiply(R2, T2, A1, B1, N2); |
|
608 |
|
609 // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1 |
|
610 |
|
611 carry += Add(T0, T0, R0, N); |
|
612 carry += Add(T0, T0, R2, N); |
|
613 carry += Add(R1, R1, T0, N); |
|
614 |
|
615 assert (carry >= 0 && carry <= 2); |
|
616 Increment(R3, N2, carry); |
|
617 } |
|
618 } |
|
619 |
|
620 // R[2*N] - result = A*A |
|
621 // T[2*N] - temporary work space |
|
622 // A[N] --- number to be squared |
|
623 |
|
624 void RecursiveSquare(word *R, word *T, const word *A, unsigned int N) |
|
625 { |
|
626 assert(N && N%2==0); |
|
627 |
|
628 if (N==2) |
|
629 AtomicMultiply(R, A, A); |
|
630 else if (N==4) |
|
631 { |
|
632 // VC60 workaround: MSVC 6.0 has an optimization problem that makes |
|
633 // (dword)A*B where either A or B has been cast to a dword before |
|
634 // very expensive. Revisit a CombaSquare4() function when this |
|
635 // problem is fixed. |
|
636 |
|
637 // WARNING: KeithR. 05/08/03 This routine doesn't work with gcc on hardware |
|
638 // either. I've completely removed it. It may be worth looking into sometime |
|
639 // in the future. Therefore, we use the CombaMultiply4 on all targets. |
|
640 //#ifdef __WINS__ |
|
641 CombaMultiply4(R, A, A); |
|
642 /*#else |
|
643 CombaSquare4(R, A); |
|
644 #endif*/ |
|
645 } |
|
646 else |
|
647 { |
|
648 const unsigned int N2 = N/2; |
|
649 |
|
650 RecursiveSquare(R0, T2, A0, N2); |
|
651 RecursiveSquare(R2, T2, A1, N2); |
|
652 RecursiveMultiply(T0, T2, A0, A1, N2); |
|
653 |
|
654 word carry = Add(R1, R1, T0, N); |
|
655 carry += Add(R1, R1, T0, N); |
|
656 Increment(R3, N2, carry); |
|
657 } |
|
658 } |
|
659 // R[N] - bottom half of A*B |
|
660 // T[N] - temporary work space |
|
661 // A[N] - multiplier |
|
662 // B[N] - multiplicant |
|
663 |
|
664 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N) |
|
665 { |
|
666 assert(N>=2 && N%2==0); |
|
667 |
|
668 if (N==2) |
|
669 AtomicMultiplyBottom(R, A, B); |
|
670 else if (N==4) |
|
671 CombaMultiplyBottom4(R, A, B); |
|
672 else if (N==8) |
|
673 CombaMultiplyBottom8(R, A, B); |
|
674 else |
|
675 { |
|
676 const unsigned int N2 = N/2; |
|
677 |
|
678 RecursiveMultiply(R, T, A0, B0, N2); |
|
679 RecursiveMultiplyBottom(T0, T1, A1, B0, N2); |
|
680 Add(R1, R1, T0, N2); |
|
681 RecursiveMultiplyBottom(T0, T1, A0, B1, N2); |
|
682 Add(R1, R1, T0, N2); |
|
683 } |
|
684 } |
|
685 |
|
686 // R[N] --- upper half of A*B |
|
687 // T[2*N] - temporary work space |
|
688 // L[N] --- lower half of A*B |
|
689 // A[N] --- multiplier |
|
690 // B[N] --- multiplicant |
|
691 |
|
692 void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N) |
|
693 { |
|
694 assert(N>=2 && N%2==0); |
|
695 |
|
696 if (N==2) |
|
697 { |
|
698 AtomicMultiply(T, A, B); |
|
699 ((dword *)R)[0] = ((dword *)T)[1]; |
|
700 } |
|
701 else if (N==4) |
|
702 { |
|
703 CombaMultiply4(T, A, B); |
|
704 ((dword *)R)[0] = ((dword *)T)[2]; |
|
705 ((dword *)R)[1] = ((dword *)T)[3]; |
|
706 } |
|
707 else |
|
708 { |
|
709 const unsigned int N2 = N/2; |
|
710 int carry; |
|
711 |
|
712 int aComp = Compare(A0, A1, N2); |
|
713 int bComp = Compare(B0, B1, N2); |
|
714 |
|
715 switch (2*aComp + aComp + bComp) |
|
716 { |
|
717 case -4: |
|
718 Subtract(R0, A1, A0, N2); |
|
719 Subtract(R1, B0, B1, N2); |
|
720 RecursiveMultiply(T0, T2, R0, R1, N2); |
|
721 Subtract(T1, T1, R0, N2); |
|
722 carry = -1; |
|
723 break; |
|
724 case -2: |
|
725 Subtract(R0, A1, A0, N2); |
|
726 Subtract(R1, B0, B1, N2); |
|
727 RecursiveMultiply(T0, T2, R0, R1, N2); |
|
728 carry = 0; |
|
729 break; |
|
730 case 2: |
|
731 Subtract(R0, A0, A1, N2); |
|
732 Subtract(R1, B1, B0, N2); |
|
733 RecursiveMultiply(T0, T2, R0, R1, N2); |
|
734 carry = 0; |
|
735 break; |
|
736 case 4: |
|
737 Subtract(R0, A1, A0, N2); |
|
738 Subtract(R1, B0, B1, N2); |
|
739 RecursiveMultiply(T0, T2, R0, R1, N2); |
|
740 Subtract(T1, T1, R1, N2); |
|
741 carry = -1; |
|
742 break; |
|
743 default: |
|
744 SetWords(T0, 0, N); |
|
745 carry = 0; |
|
746 } |
|
747 |
|
748 RecursiveMultiply(T2, R0, A1, B1, N2); |
|
749 |
|
750 // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1 |
|
751 |
|
752 CopyWords(R0, L+N2, N2); |
|
753 word c2 = Subtract(R0, R0, L, N2); |
|
754 c2 += Subtract(R0, R0, T0, N2); |
|
755 word t = (Compare(R0, T2, N2) == -1); |
|
756 |
|
757 carry += t; |
|
758 carry += Increment(R0, N2, c2+t); |
|
759 carry += Add(R0, R0, T1, N2); |
|
760 carry += Add(R0, R0, T3, N2); |
|
761 |
|
762 CopyWords(R1, T3, N2); |
|
763 assert (carry >= 0 && carry <= 2); |
|
764 Increment(R1, N2, carry); |
|
765 } |
|
766 } |
|
767 |
|
768 // R[NA+NB] - result = A*B |
|
769 // T[NA+NB] - temporary work space |
|
770 // A[NA] ---- multiplier |
|
771 // B[NB] ---- multiplicant |
|
772 |
|
773 void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB) |
|
774 { |
|
775 if (NA == NB) |
|
776 { |
|
777 if (A == B) |
|
778 RecursiveSquare(R, T, A, NA); |
|
779 else |
|
780 RecursiveMultiply(R, T, A, B, NA); |
|
781 |
|
782 return; |
|
783 } |
|
784 |
|
785 if (NA > NB) |
|
786 { |
|
787 TClassSwap(A, B); |
|
788 TClassSwap(NA, NB); |
|
789 //std::swap(A, B); |
|
790 //std::swap(NA, NB); |
|
791 } |
|
792 |
|
793 assert(NB % NA == 0); |
|
794 assert((NB/NA)%2 == 0); // NB is an even multiple of NA |
|
795 |
|
796 if (NA==2 && !A[1]) |
|
797 { |
|
798 switch (A[0]) |
|
799 { |
|
800 case 0: |
|
801 SetWords(R, 0, NB+2); |
|
802 return; |
|
803 case 1: |
|
804 CopyWords(R, B, NB); |
|
805 R[NB] = R[NB+1] = 0; |
|
806 return; |
|
807 default: |
|
808 R[NB] = LinearMultiply(R, B, A[0], NB); |
|
809 R[NB+1] = 0; |
|
810 return; |
|
811 } |
|
812 } |
|
813 |
|
814 RecursiveMultiply(R, T, A, B, NA); |
|
815 CopyWords(T+2*NA, R+NA, NA); |
|
816 |
|
817 unsigned i; |
|
818 |
|
819 for (i=2*NA; i<NB; i+=2*NA) |
|
820 RecursiveMultiply(T+NA+i, T, A, B+i, NA); |
|
821 for (i=NA; i<NB; i+=2*NA) |
|
822 RecursiveMultiply(R+i, T, A, B+i, NA); |
|
823 |
|
824 if (Add(R+NA, R+NA, T+2*NA, NB-NA)) |
|
825 Increment(R+NB, NA); |
|
826 } |
|
827 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N) |
|
828 // T[3*N/2] - temporary work space |
|
829 // A[N] ----- an odd number as input |
|
830 |
|
831 void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N) |
|
832 { |
|
833 if (N==2) |
|
834 AtomicInverseModPower2(R, A[0], A[1]); |
|
835 else |
|
836 { |
|
837 const unsigned int N2 = N/2; |
|
838 RecursiveInverseModPower2(R0, T0, A0, N2); |
|
839 T0[0] = 1; |
|
840 SetWords(T0+1, 0, N2-1); |
|
841 RecursiveMultiplyTop(R1, T1, T0, R0, A0, N2); |
|
842 RecursiveMultiplyBottom(T0, T1, R0, A1, N2); |
|
843 Add(T0, R1, T0, N2); |
|
844 TwosComplement(T0, N2); |
|
845 RecursiveMultiplyBottom(R1, T1, R0, T0, N2); |
|
846 } |
|
847 } |
|
848 #undef A0 |
|
849 #undef A1 |
|
850 #undef B0 |
|
851 #undef B1 |
|
852 |
|
853 #undef T0 |
|
854 #undef T1 |
|
855 #undef T2 |
|
856 #undef T3 |
|
857 |
|
858 #undef R0 |
|
859 #undef R1 |
|
860 #undef R2 |
|
861 #undef R3 |
|
862 |
|
863 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M |
|
864 // T[3*N] - temporary work space |
|
865 // X[2*N] - number to be reduced |
|
866 // M[N] --- modulus |
|
867 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N) |
|
868 |
|
869 void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, unsigned int N) |
|
870 { |
|
871 RecursiveMultiplyBottom(R, T, X, U, N); |
|
872 RecursiveMultiplyTop(T, T+N, X, R, M, N); |
|
873 if (Subtract(R, X+N, T, N)) |
|
874 { |
|
875 #ifdef _DEBUG |
|
876 word carry = Add(R, R, M, N); |
|
877 assert(carry); |
|
878 #else |
|
879 Add(R, R, M, N); |
|
880 #endif |
|
881 } |
|
882 } |
|
883 |
|
884 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A |
|
885 static word SubatomicDivide(word *A, word B0, word B1) |
|
886 { |
|
887 // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word |
|
888 assert(A[2] < B1 || (A[2]==B1 && A[1] < B0)); |
|
889 |
|
890 dword p, u; |
|
891 word Q; |
|
892 |
|
893 // estimate the quotient: do a 2 word by 1 word divide |
|
894 if (B1+1 == 0) |
|
895 Q = A[2]; |
|
896 else |
|
897 Q = word(MAKE_DWORD(A[1], A[2]) / (B1+1)); |
|
898 |
|
899 // now subtract Q*B from A |
|
900 p = (dword) B0*Q; |
|
901 u = (dword) A[0] - LOW_WORD(p); |
|
902 A[0] = LOW_WORD(u); |
|
903 u = (dword) A[1] - HIGH_WORD(p) - (word)(0-HIGH_WORD(u)) - (dword)B1*Q; |
|
904 A[1] = LOW_WORD(u); |
|
905 A[2] += HIGH_WORD(u); |
|
906 |
|
907 // Q <= actual quotient, so fix it |
|
908 while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0)) |
|
909 { |
|
910 u = (dword) A[0] - B0; |
|
911 A[0] = LOW_WORD(u); |
|
912 u = (dword) A[1] - B1 - (word)(0-HIGH_WORD(u)); |
|
913 A[1] = LOW_WORD(u); |
|
914 A[2] += HIGH_WORD(u); |
|
915 Q++; |
|
916 assert(Q); // shouldn't overflow |
|
917 } |
|
918 |
|
919 return Q; |
|
920 } |
|
921 |
|
922 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1 |
|
923 static inline void AtomicDivide(word *Q, const word *A, const word *B) |
|
924 { |
|
925 if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS) |
|
926 { |
|
927 Q[0] = A[2]; |
|
928 Q[1] = A[3]; |
|
929 } |
|
930 else |
|
931 { |
|
932 word T[4]; |
|
933 T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3]; |
|
934 Q[1] = SubatomicDivide(T+1, B[0], B[1]); |
|
935 Q[0] = SubatomicDivide(T, B[0], B[1]); |
|
936 |
|
937 #ifdef _DEBUG |
|
938 // multiply quotient and divisor and add remainder, make sure it equals dividend |
|
939 assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0]))); |
|
940 word P[4]; |
|
941 AtomicMultiply(P, Q, B); |
|
942 Add(P, P, T, 4); |
|
943 assert(Mem::Compare((TUint8*)P, 4*WORD_SIZE, (TUint8*)A, 4*WORD_SIZE)==0); |
|
944 #endif |
|
945 } |
|
946 } |
|
947 |
|
948 // for use by Divide(), corrects the underestimated quotient {Q1,Q0} |
|
949 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, unsigned int N) |
|
950 { |
|
951 assert(N && N%2==0); |
|
952 |
|
953 if (Q[1]) |
|
954 { |
|
955 T[N] = T[N+1] = 0; |
|
956 unsigned i; |
|
957 for (i=0; i<N; i+=4) |
|
958 AtomicMultiply(T+i, Q, B+i); |
|
959 for (i=2; i<N; i+=4) |
|
960 if (AtomicMultiplyAdd(T+i, Q, B+i)) |
|
961 T[i+5] += (++T[i+4]==0); |
|
962 } |
|
963 else |
|
964 { |
|
965 T[N] = LinearMultiply(T, B, Q[0], N); |
|
966 T[N+1] = 0; |
|
967 } |
|
968 |
|
969 #ifdef _DEBUG |
|
970 word borrow = Subtract(R, R, T, N+2); |
|
971 assert(!borrow && !R[N+1]); |
|
972 #else |
|
973 Subtract(R, R, T, N+2); |
|
974 #endif |
|
975 |
|
976 while (R[N] || Compare(R, B, N) >= 0) |
|
977 { |
|
978 R[N] -= Subtract(R, R, B, N); |
|
979 Q[1] += (++Q[0]==0); |
|
980 assert(Q[0] || Q[1]); // no overflow |
|
981 } |
|
982 } |
|
983 |
|
984 // R[NB] -------- remainder = A%B |
|
985 // Q[NA-NB+2] --- quotient = A/B |
|
986 // T[NA+2*NB+4] - temp work space |
|
987 // A[NA] -------- dividend |
|
988 // B[NB] -------- divisor |
|
989 |
|
990 void Divide(word *R, word *Q, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB) |
|
991 { |
|
992 assert(NA && NB && NA%2==0 && NB%2==0); |
|
993 assert(B[NB-1] || B[NB-2]); |
|
994 assert(NB <= NA); |
|
995 |
|
996 // set up temporary work space |
|
997 word *const TA=T; |
|
998 word *const TB=T+NA+2; |
|
999 word *const TP=T+NA+2+NB; |
|
1000 |
|
1001 // copy B into TB and normalize it so that TB has highest bit set to 1 |
|
1002 unsigned shiftWords = (B[NB-1]==0); |
|
1003 TB[0] = TB[NB-1] = 0; |
|
1004 CopyWords(TB+shiftWords, B, NB-shiftWords); |
|
1005 unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]); |
|
1006 assert(shiftBits < WORD_BITS); |
|
1007 ShiftWordsLeftByBits(TB, NB, shiftBits); |
|
1008 |
|
1009 // copy A into TA and normalize it |
|
1010 TA[0] = TA[NA] = TA[NA+1] = 0; |
|
1011 CopyWords(TA+shiftWords, A, NA); |
|
1012 ShiftWordsLeftByBits(TA, NA+2, shiftBits); |
|
1013 |
|
1014 if (TA[NA+1]==0 && TA[NA] <= 1) |
|
1015 { |
|
1016 Q[NA-NB+1] = Q[NA-NB] = 0; |
|
1017 while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0) |
|
1018 { |
|
1019 TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB); |
|
1020 ++Q[NA-NB]; |
|
1021 } |
|
1022 } |
|
1023 else |
|
1024 { |
|
1025 NA+=2; |
|
1026 assert(Compare(TA+NA-NB, TB, NB) < 0); |
|
1027 } |
|
1028 |
|
1029 word BT[2]; |
|
1030 BT[0] = TB[NB-2] + 1; |
|
1031 BT[1] = TB[NB-1] + (BT[0]==0); |
|
1032 |
|
1033 // start reducing TA mod TB, 2 words at a time |
|
1034 for (unsigned i=NA-2; i>=NB; i-=2) |
|
1035 { |
|
1036 AtomicDivide(Q+i-NB, TA+i-2, BT); |
|
1037 CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB); |
|
1038 } |
|
1039 |
|
1040 // copy TA into R, and denormalize it |
|
1041 CopyWords(R, TA+shiftWords, NB); |
|
1042 ShiftWordsRightByBits(R, NB, shiftBits); |
|
1043 } |
|
1044 |
|
1045 static inline unsigned int EvenWordCount(const word *X, unsigned int N) |
|
1046 { |
|
1047 while (N && X[N-2]==0 && X[N-1]==0) |
|
1048 N-=2; |
|
1049 return N; |
|
1050 } |
|
1051 |
|
1052 // return k |
|
1053 // R[N] --- result = A^(-1) * 2^k mod M |
|
1054 // T[4*N] - temporary work space |
|
1055 // A[NA] -- number to take inverse of |
|
1056 // M[N] --- modulus |
|
1057 |
|
1058 unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA, const word *M, unsigned int N) |
|
1059 { |
|
1060 assert(NA<=N && N && N%2==0); |
|
1061 |
|
1062 word *b = T; |
|
1063 word *c = T+N; |
|
1064 word *f = T+2*N; |
|
1065 word *g = T+3*N; |
|
1066 unsigned int bcLen=2, fgLen=EvenWordCount(M, N); |
|
1067 unsigned int k=0, s=0; |
|
1068 |
|
1069 SetWords(T, 0, 3*N); |
|
1070 b[0]=1; |
|
1071 CopyWords(f, A, NA); |
|
1072 CopyWords(g, M, N); |
|
1073 |
|
1074 FOREVER |
|
1075 { |
|
1076 word t=f[0]; |
|
1077 while (!t) |
|
1078 { |
|
1079 if (EvenWordCount(f, fgLen)==0) |
|
1080 { |
|
1081 SetWords(R, 0, N); |
|
1082 return 0; |
|
1083 } |
|
1084 |
|
1085 ShiftWordsRightByWords(f, fgLen, 1); |
|
1086 if (c[bcLen-1]) bcLen+=2; |
|
1087 assert(bcLen <= N); |
|
1088 ShiftWordsLeftByWords(c, bcLen, 1); |
|
1089 k+=WORD_BITS; |
|
1090 t=f[0]; |
|
1091 } |
|
1092 |
|
1093 unsigned int i=0; |
|
1094 while (t%2 == 0) |
|
1095 { |
|
1096 t>>=1; |
|
1097 i++; |
|
1098 } |
|
1099 k+=i; |
|
1100 |
|
1101 if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2) |
|
1102 { |
|
1103 if (s%2==0) |
|
1104 CopyWords(R, b, N); |
|
1105 else |
|
1106 Subtract(R, M, b, N); |
|
1107 return k; |
|
1108 } |
|
1109 |
|
1110 ShiftWordsRightByBits(f, fgLen, i); |
|
1111 t=ShiftWordsLeftByBits(c, bcLen, i); |
|
1112 if (t) |
|
1113 { |
|
1114 c[bcLen] = t; |
|
1115 bcLen+=2; |
|
1116 assert(bcLen <= N); |
|
1117 } |
|
1118 |
|
1119 if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0) |
|
1120 fgLen-=2; |
|
1121 |
|
1122 if (Compare(f, g, fgLen)==-1) |
|
1123 { |
|
1124 TClassSwap<word*>(f,g); |
|
1125 TClassSwap<word*>(b,c); |
|
1126 s++; |
|
1127 } |
|
1128 |
|
1129 Subtract(f, f, g, fgLen); |
|
1130 |
|
1131 if (Add(b, b, c, bcLen)) |
|
1132 { |
|
1133 b[bcLen] = 1; |
|
1134 bcLen+=2; |
|
1135 assert(bcLen <= N); |
|
1136 } |
|
1137 } |
|
1138 } |
|
1139 |
|
1140 // R[N] - result = A/(2^k) mod M |
|
1141 // A[N] - input |
|
1142 // M[N] - modulus |
|
1143 |
|
1144 void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N) |
|
1145 { |
|
1146 CopyWords(R, A, N); |
|
1147 |
|
1148 while (k--) |
|
1149 { |
|
1150 if (R[0]%2==0) |
|
1151 ShiftWordsRightByBits(R, N, 1); |
|
1152 else |
|
1153 { |
|
1154 word carry = Add(R, R, M, N); |
|
1155 ShiftWordsRightByBits(R, N, 1); |
|
1156 R[N-1] += carry<<(WORD_BITS-1); |
|
1157 } |
|
1158 } |
|
1159 } |
|
1160 |