Skip to content

Commit ea10e96

Browse files
Tom St Denis authored and sjaeckel committed
added tomsfastmath-0.10
1 parent 4b43916 commit ea10e96

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

57 files changed

+18859
-15419
lines changed

TODO

Whitespace-only changes.

changes.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
November 1st, 2006
2+
0.10 -- Unrolled Montgomery reduction for 1..16 digits with TFM_SMALL_MONT_SET; between 10% and 25% speedup depending on size.
3+
-- fixed fp_sqr_comba.c so it builds in ISO C mode [Andreas Lange]
4+
-- [email protected] pointed out fp_radix_size() had a few typos that affected correctness. Fixed.
5+
-- Added support for ECC performance, e.g. define "-DTFM_ALREADY_SET -DTFM_ECC192" and it will disable
6+
all of the unrolled code EXCEPT what is required for ECC P-192. It autodetects 32/64-bit platforms too. It's super neato.
7+
Support for 192, 224, 256, 384 and 521 bit curves through the defines [see tfm.h]
8+
-- AVR32 support added, define TFM_AVR32 to enable
9+
110
April 4th, 2006
211
0.09 -- Bruce Guenter suggested I use --tag=CC for libtool builds where the compiler may think it's C++.
312
-- Added support for k=1 in exptmod for RSA exponents. Makes it more competitive with other libraries

comba_mont_gen.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ int main(void)
55
int x, y, z;
66

77
printf(
8-
#if 0
8+
#if 1
99
"#ifdef TFM_SMALL_SET\n"
1010
"/* computes x/R == x (mod N) via Montgomery Reduction */\n"
1111
"void fp_montgomery_reduce_small(fp_int *a, fp_int *m, fp_digit mp)\n"
@@ -34,7 +34,7 @@ printf(
3434
"\n"
3535
" switch (pa) {\n");
3636

37-
for (x = 1; x <= 64; x++) {
37+
for (x = 1; x <= 16; x++) {
3838
if (x > 16 && (x != 32 && x != 48 && x != 64)) continue;
3939
if (x > 16) printf("#ifdef TFM_HUGE\n");
4040

@@ -99,7 +99,7 @@ if (x > 16) printf("#endif /* TFM_HUGE */\n");
9999

100100
}
101101

102-
#if 0
102+
#if 1
103103

104104
printf(
105105
" }\n"

comba_mult_gen.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ printf(
2626
" memcpy(at+%d, B->dp, %d * sizeof(fp_digit));\n"
2727
" COMBA_START;\n"
2828
"\n"
29-
" COMBA_CLEAR;\n", N, N+N, N, N, N, N);
29+
" COMBA_CLEAR;\n", N, N+N, N, N, N);
3030

3131
/* now do the rows */
3232
for (x = 0; x < (N+N-1); x++) {
@@ -53,7 +53,7 @@ printf(
5353
" C->sign = A->sign ^ B->sign;\n"
5454
" fp_clamp(C);\n"
5555
" COMBA_FINI;\n"
56-
"}\n\n\n", N+N-1, N+N, N+N);
56+
"}\n\n\n", N+N-1, N+N);
5757

5858
return 0;
5959
}

comba_sqr_gen.c

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,15 @@
88
* Tom St Denis, [email protected]
99
*/
1010

11-
/* Generates squaring comba code... it learns it knows our secrets! */
1211
#include <stdio.h>
1312

1413
int main(int argc, char **argv)
1514
{
1615
int x, y, z, N, f;
1716
N = atoi(argv[1]);
1817

19-
if (N >= 16 && N < 32) printf("#ifdef TFM_LARGE\n");
20-
if (N >= 32) printf("#ifdef TFM_HUGE\n");
21-
2218
printf(
19+
"#ifdef TFM_SQR%d\n"
2320
"void fp_sqr_comba%d(fp_int *A, fp_int *B)\n"
2421
"{\n"
2522
" fp_digit *a, b[%d], c0, c1, c2, sc0, sc1, sc2;\n"
@@ -32,7 +29,7 @@ printf(
3229
"\n"
3330
" /* output 0 */\n"
3431
" SQRADD(a[0],a[0]);\n"
35-
" COMBA_STORE(b[0]);\n", N, N+N);
32+
" COMBA_STORE(b[0]);\n", N, N, N+N);
3633

3734
for (x = 1; x < N+N-1; x++) {
3835
printf(
@@ -94,9 +91,7 @@ printf(
9491
" B->sign = FP_ZPOS;\n"
9592
" memcpy(B->dp, b, %d * sizeof(fp_digit));\n"
9693
" fp_clamp(B);\n"
97-
"}\n\n\n", N+N, N+N);
98-
99-
if (N >= 16) printf("#endif\n");
94+
"}\n#endif\n\n\n", N+N, N+N);
10095

10196
return 0;
10297
}

demo/test.c

Lines changed: 202 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@ static ulong64 TIMFUNC (void)
3434
unsigned long a, b;
3535
__asm__ __volatile__ ("mftbu %1 \nmftb %0\n":"=r"(a), "=r"(b));
3636
return (((ulong64)b) << 32ULL) | ((ulong64)a);
37+
#elif defined(TFM_AVR32)
38+
FILE *in;
39+
char buf[20];
40+
in = fopen("/sys/devices/system/cpu/cpu0/pccycles", "r");
41+
fgets(buf, 20, in);
42+
fclose(in);
43+
return strtoul(buf, NULL, 10);
3744
#else /* gcc-IA64 version */
3845
unsigned long result;
3946
__asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
@@ -213,7 +220,7 @@ t1 = TIMFUNC();
213220
sleep(1);
214221
printf("Ticks per second: %llu\n", TIMFUNC() - t1);
215222

216-
goto monttime;
223+
goto multtime;
217224
/* do some timings... */
218225
printf("Addition:\n");
219226
for (t = 2; t <= FP_SIZE/2; t += 2) {
@@ -240,7 +247,7 @@ goto monttime;
240247
}
241248
multtime:
242249
printf("Multiplication:\n");
243-
for (t = 2; t <= FP_SIZE/2; t += 2) {
250+
for (t = 2; t < FP_SIZE/2; t += 2) {
244251
fp_zero(&a);
245252
fp_zero(&b);
246253
fp_zero(&c);
@@ -251,31 +258,155 @@ goto monttime;
251258
a.used = t;
252259
b.used = t;
253260
t2 = -1;
254-
for (ix = 0; ix < 10000; ++ix) {
261+
for (ix = 0; ix < 100; ++ix) {
255262
t1 = TIMFUNC();
256263
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
257264
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
258-
t2 = (TIMFUNC() - t1)>>2;
265+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
266+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
267+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
268+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
269+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
270+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
271+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
272+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
273+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
274+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
275+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
276+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
277+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
278+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
279+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
280+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
281+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
282+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
283+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
284+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
285+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
286+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
287+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
288+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
289+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
290+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
291+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
292+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
293+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
294+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
295+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
296+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
297+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
298+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
299+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
300+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
301+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
302+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
303+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
304+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
305+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
306+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
307+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
308+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
309+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
310+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
311+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
312+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
313+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
314+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
315+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
316+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
317+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
318+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
319+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
320+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
321+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
322+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
323+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
324+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
325+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
326+
fp_mul(&a, &b, &c); fp_mul(&a, &b, &c);
327+
t2 = (TIMFUNC() - t1)>>7;
259328
if (t1<t2) { --ix; t2 = t1; }
260329
}
261330
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
262331
}
263332
//#else
264333
sqrtime:
265334
printf("Squaring:\n");
266-
for (t = 2; t <= FP_SIZE/2; t += 2) {
335+
for (t = 2; t < FP_SIZE/2; t += 2) {
267336
fp_zero(&a);
268337
fp_zero(&b);
269338
for (ix = 0; ix < t; ix++) {
270339
a.dp[ix] = ix;
271340
}
272341
a.used = t;
273342
t2 = -1;
274-
for (ix = 0; ix < 10000; ++ix) {
343+
for (ix = 0; ix < 100; ++ix) {
275344
t1 = TIMFUNC();
276345
fp_sqr(&a, &b); fp_sqr(&a, &b);
277346
fp_sqr(&a, &b); fp_sqr(&a, &b);
278-
t2 = (TIMFUNC() - t1)>>2;
347+
fp_sqr(&a, &b); fp_sqr(&a, &b);
348+
fp_sqr(&a, &b); fp_sqr(&a, &b);
349+
fp_sqr(&a, &b); fp_sqr(&a, &b);
350+
fp_sqr(&a, &b); fp_sqr(&a, &b);
351+
fp_sqr(&a, &b); fp_sqr(&a, &b);
352+
fp_sqr(&a, &b); fp_sqr(&a, &b);
353+
fp_sqr(&a, &b); fp_sqr(&a, &b);
354+
fp_sqr(&a, &b); fp_sqr(&a, &b);
355+
fp_sqr(&a, &b); fp_sqr(&a, &b);
356+
fp_sqr(&a, &b); fp_sqr(&a, &b);
357+
fp_sqr(&a, &b); fp_sqr(&a, &b);
358+
fp_sqr(&a, &b); fp_sqr(&a, &b);
359+
fp_sqr(&a, &b); fp_sqr(&a, &b);
360+
fp_sqr(&a, &b); fp_sqr(&a, &b);
361+
fp_sqr(&a, &b); fp_sqr(&a, &b);
362+
fp_sqr(&a, &b); fp_sqr(&a, &b);
363+
fp_sqr(&a, &b); fp_sqr(&a, &b);
364+
fp_sqr(&a, &b); fp_sqr(&a, &b);
365+
fp_sqr(&a, &b); fp_sqr(&a, &b);
366+
fp_sqr(&a, &b); fp_sqr(&a, &b);
367+
fp_sqr(&a, &b); fp_sqr(&a, &b);
368+
fp_sqr(&a, &b); fp_sqr(&a, &b);
369+
fp_sqr(&a, &b); fp_sqr(&a, &b);
370+
fp_sqr(&a, &b); fp_sqr(&a, &b);
371+
fp_sqr(&a, &b); fp_sqr(&a, &b);
372+
fp_sqr(&a, &b); fp_sqr(&a, &b);
373+
fp_sqr(&a, &b); fp_sqr(&a, &b);
374+
fp_sqr(&a, &b); fp_sqr(&a, &b);
375+
fp_sqr(&a, &b); fp_sqr(&a, &b);
376+
fp_sqr(&a, &b); fp_sqr(&a, &b);
377+
fp_sqr(&a, &b); fp_sqr(&a, &b);
378+
fp_sqr(&a, &b); fp_sqr(&a, &b);
379+
fp_sqr(&a, &b); fp_sqr(&a, &b);
380+
fp_sqr(&a, &b); fp_sqr(&a, &b);
381+
fp_sqr(&a, &b); fp_sqr(&a, &b);
382+
fp_sqr(&a, &b); fp_sqr(&a, &b);
383+
fp_sqr(&a, &b); fp_sqr(&a, &b);
384+
fp_sqr(&a, &b); fp_sqr(&a, &b);
385+
fp_sqr(&a, &b); fp_sqr(&a, &b);
386+
fp_sqr(&a, &b); fp_sqr(&a, &b);
387+
fp_sqr(&a, &b); fp_sqr(&a, &b);
388+
fp_sqr(&a, &b); fp_sqr(&a, &b);
389+
fp_sqr(&a, &b); fp_sqr(&a, &b);
390+
fp_sqr(&a, &b); fp_sqr(&a, &b);
391+
fp_sqr(&a, &b); fp_sqr(&a, &b);
392+
fp_sqr(&a, &b); fp_sqr(&a, &b);
393+
fp_sqr(&a, &b); fp_sqr(&a, &b);
394+
fp_sqr(&a, &b); fp_sqr(&a, &b);
395+
fp_sqr(&a, &b); fp_sqr(&a, &b);
396+
fp_sqr(&a, &b); fp_sqr(&a, &b);
397+
fp_sqr(&a, &b); fp_sqr(&a, &b);
398+
fp_sqr(&a, &b); fp_sqr(&a, &b);
399+
fp_sqr(&a, &b); fp_sqr(&a, &b);
400+
fp_sqr(&a, &b); fp_sqr(&a, &b);
401+
fp_sqr(&a, &b); fp_sqr(&a, &b);
402+
fp_sqr(&a, &b); fp_sqr(&a, &b);
403+
fp_sqr(&a, &b); fp_sqr(&a, &b);
404+
fp_sqr(&a, &b); fp_sqr(&a, &b);
405+
fp_sqr(&a, &b); fp_sqr(&a, &b);
406+
fp_sqr(&a, &b); fp_sqr(&a, &b);
407+
fp_sqr(&a, &b); fp_sqr(&a, &b);
408+
fp_sqr(&a, &b); fp_sqr(&a, &b);
409+
t2 = (TIMFUNC() - t1)>>7;
279410
if (t1<t2) { --ix; t2 = t1; }
280411
}
281412
printf("%5lu-bit: %9llu\n", t * DIGIT_BIT, t2);
@@ -298,11 +429,73 @@ goto monttime;
298429
fp_copy(&b, &d);
299430

300431
t2 = -1;
301-
for (ix = 0; ix < 10000; ++ix) {
432+
for (ix = 0; ix < 100; ++ix) {
302433
t1 = TIMFUNC();
303434
fp_montgomery_reduce(&c, &a, &fp);
304435
fp_montgomery_reduce(&d, &a, &fp);
305-
t2 = (TIMFUNC() - t1)>>1;
436+
fp_montgomery_reduce(&c, &a, &fp);
437+
fp_montgomery_reduce(&d, &a, &fp);
438+
fp_montgomery_reduce(&c, &a, &fp);
439+
fp_montgomery_reduce(&d, &a, &fp);
440+
fp_montgomery_reduce(&c, &a, &fp);
441+
fp_montgomery_reduce(&d, &a, &fp);
442+
fp_montgomery_reduce(&c, &a, &fp);
443+
fp_montgomery_reduce(&d, &a, &fp);
444+
fp_montgomery_reduce(&c, &a, &fp);
445+
fp_montgomery_reduce(&d, &a, &fp);
446+
fp_montgomery_reduce(&c, &a, &fp);
447+
fp_montgomery_reduce(&d, &a, &fp);
448+
fp_montgomery_reduce(&c, &a, &fp);
449+
fp_montgomery_reduce(&d, &a, &fp);
450+
fp_montgomery_reduce(&c, &a, &fp);
451+
fp_montgomery_reduce(&d, &a, &fp);
452+
fp_montgomery_reduce(&c, &a, &fp);
453+
fp_montgomery_reduce(&d, &a, &fp);
454+
fp_montgomery_reduce(&c, &a, &fp);
455+
fp_montgomery_reduce(&d, &a, &fp);
456+
fp_montgomery_reduce(&c, &a, &fp);
457+
fp_montgomery_reduce(&d, &a, &fp);
458+
fp_montgomery_reduce(&c, &a, &fp);
459+
fp_montgomery_reduce(&d, &a, &fp);
460+
fp_montgomery_reduce(&c, &a, &fp);
461+
fp_montgomery_reduce(&d, &a, &fp);
462+
fp_montgomery_reduce(&c, &a, &fp);
463+
fp_montgomery_reduce(&d, &a, &fp);
464+
fp_montgomery_reduce(&c, &a, &fp);
465+
fp_montgomery_reduce(&d, &a, &fp);
466+
fp_montgomery_reduce(&c, &a, &fp);
467+
fp_montgomery_reduce(&d, &a, &fp);
468+
fp_montgomery_reduce(&c, &a, &fp);
469+
fp_montgomery_reduce(&d, &a, &fp);
470+
fp_montgomery_reduce(&c, &a, &fp);
471+
fp_montgomery_reduce(&d, &a, &fp);
472+
fp_montgomery_reduce(&c, &a, &fp);
473+
fp_montgomery_reduce(&d, &a, &fp);
474+
fp_montgomery_reduce(&c, &a, &fp);
475+
fp_montgomery_reduce(&d, &a, &fp);
476+
fp_montgomery_reduce(&c, &a, &fp);
477+
fp_montgomery_reduce(&d, &a, &fp);
478+
fp_montgomery_reduce(&c, &a, &fp);
479+
fp_montgomery_reduce(&d, &a, &fp);
480+
fp_montgomery_reduce(&c, &a, &fp);
481+
fp_montgomery_reduce(&d, &a, &fp);
482+
fp_montgomery_reduce(&c, &a, &fp);
483+
fp_montgomery_reduce(&d, &a, &fp);
484+
fp_montgomery_reduce(&c, &a, &fp);
485+
fp_montgomery_reduce(&d, &a, &fp);
486+
fp_montgomery_reduce(&c, &a, &fp);
487+
fp_montgomery_reduce(&d, &a, &fp);
488+
fp_montgomery_reduce(&c, &a, &fp);
489+
fp_montgomery_reduce(&d, &a, &fp);
490+
fp_montgomery_reduce(&c, &a, &fp);
491+
fp_montgomery_reduce(&d, &a, &fp);
492+
fp_montgomery_reduce(&c, &a, &fp);
493+
fp_montgomery_reduce(&d, &a, &fp);
494+
fp_montgomery_reduce(&c, &a, &fp);
495+
fp_montgomery_reduce(&d, &a, &fp);
496+
fp_montgomery_reduce(&c, &a, &fp);
497+
fp_montgomery_reduce(&d, &a, &fp);
498+
t2 = (TIMFUNC() - t1)>>6;
306499
fp_copy(&b, &c);
307500
fp_copy(&b, &d);
308501
if (t1<t2) { --ix; t2 = t1; }

doc/tfm.pdf

3.84 KB
Binary file not shown.

fp_ident.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,25 @@ const char *fp_ident(void)
4242
#ifdef TFM_ARM
4343
" TFM_ARM "
4444
#endif
45+
#ifdef TFM_PPC32
46+
" TFM_PPC32 "
47+
#endif
48+
#ifdef TFM_AVR32
49+
" TFM_AVR32 "
50+
#endif
51+
#ifdef TFM_ECC192
52+
" TFM_ECC192 "
53+
#endif
54+
#ifdef TFM_ECC224
55+
" TFM_ECC224 "
56+
#endif
57+
#ifdef TFM_ECC384
58+
" TFM_ECC384 "
59+
#endif
60+
#ifdef TFM_ECC521
61+
" TFM_ECC521 "
62+
#endif
63+
4564
#ifdef TFM_NO_ASM
4665
" TFM_NO_ASM "
4766
#endif

0 commit comments

Comments
 (0)