diff -Nru gmp-ecm-7.0.4+ds/acinclude.m4 gmp-ecm-7.0.5+ds/acinclude.m4 --- gmp-ecm-7.0.4+ds/acinclude.m4 2016-08-23 12:25:18.000000000 +0000 +++ gmp-ecm-7.0.5+ds/acinclude.m4 2022-06-06 14:16:49.000000000 +0000 @@ -292,8 +292,10 @@ [cat >conftes1.c <conftes2.s < compiles with $NVCC $flags + +m4_define([NVCC_CHECK_COMPILE], +[ + echo "$1" > conftest.cu + $NVCC -c conftest.cu -o conftest.o $2 > /dev/null 2>&1 + ret=$? + rm conftest.cu + AS_IF([test "$ret" -eq "0"], [$3], [$4]) +]) + dnl CU_CHECK_CUDA dnl Check if a GPU version is asked, for which GPU and where CUDA is install. dnl Includes are put in CUDA_INC_FLAGS @@ -376,12 +391,20 @@ AC_DEFUN([CU_CHECK_CUDA], [ -# Is the GPU version is requested? +# Is the GPU version requested? AC_ARG_ENABLE(gpu, - AS_HELP_STRING([--enable-gpu=GPU_ARCH], - [Enable the cuda version [default=no]]), - [ AS_IF([test "x$enableval" = "xno"], [ enable_gpu="no" ], - [ enable_gpu="yes" ]) ] ) + AS_HELP_STRING([--enable-gpu@<:@=GPU_ARCH@:>@], + [Build with support for CUDA stage 1, by default builds with all possible compute capabilities + to build with a single compute capability pass use --enable-gpu=XX [default=no]]), + [ AS_IF([test "x$enableval" = "xno"], + [ enable_gpu="no" ], + [ enable_gpu="yes" + AS_CASE(["x$enableval"], + [ xyes ], [], + [ x[[2-9]][[0-9]] ], [ WANTED_GPU_ARCH="$enableval" ], + [ AC_MSG_ERROR([Didn't recognize GPU_ARCH="$enableval"]) ]) + ]) ]) + AC_ARG_WITH(cuda, AS_HELP_STRING([--with-cuda=DIR], @@ -522,67 +545,98 @@ [NVCCFLAGS=" --compiler-bindir $cuda_compiler NVCCFLAGS"]) dnl check that gcc version is compatible with nvcc version - touch conftest.cu + dnl (seth) How is this checking if gcc and nvcc are compatible? AC_MSG_CHECKING([for compatibility between gcc and nvcc]) - $NVCC -c conftest.cu -o conftest.o $NVCCFLAGS > /dev/null 2>&1 - AS_IF([test "$?" 
-eq "0"], + NVCC_CHECK_COMPILE([], [$NVCCFLAGS], + [AC_MSG_RESULT([yes])], [ - AC_MSG_RESULT([yes]) - ], [ AC_MSG_RESULT([no]) AC_MSG_ERROR(gcc version is not compatible with nvcc) ]) - dnl Check which GPU architecture nvcc know - NVCCTEST="$NVCC -c conftest.cu -o conftest.o $NVCCFLAGS --dryrun" + dnl Check which GPU architecture nvcc knows GPU_ARCH="" - m4_foreach_w([compute_compatibility], [20 21 30 32 35 37 50 52 53], + m4_foreach_w([compute_capability], [30 32 35 37 50 52 53 60 61 62 70 72 75 80 86 87 90], [ - testcc=compute_compatibility - AC_MSG_CHECKING([that nvcc know compute capability $testcc]) - AS_IF([test "$testcc" -eq "21"], - [ - NEW="--generate-code arch=compute_20,code=sm_21" - ], + testcc=compute_capability + AS_IF([test -z "$WANTED_GPU_ARCH" -o "$WANTED_GPU_ARCH" = "$testcc"], [ + AC_MSG_CHECKING([that nvcc know compute capability $testcc]) NEW="--generate-code arch=compute_$testcc,code=sm_$testcc" + NVCC_CHECK_COMPILE([], [$NVCCFLAGS --dryrun $NEW], + [ + AC_MSG_RESULT([yes]) + GPU_ARCH="$GPU_ARCH $NEW" + MIN_CC=${MIN_CC:-$testcc} + ], [ + AC_MSG_RESULT([no]) + ]) ]) - $NVCCTEST $NEW > /dev/null 2>&1 - AS_IF([test "$?" -eq "0"], - [ - AC_MSG_RESULT([yes]) - GPU_ARCH="$GPU_ARCH $NEW" - ], [ - AC_MSG_RESULT([no]) - ]) - ] ) + ]) + # Use JIT compilation of GPU code for forward compatibility - GPU_ARCH="--generate-code arch=compute_20,code=compute_20 $GPU_ARCH" + AC_MSG_NOTICE([Setting MIN_CC=$MIN_CC GPU_ARCH=$GPU_ARCH]) + + AS_IF([test -z "$GPU_ARCH"], + [AC_MSG_ERROR([No supported compute capabilities found])]) dnl check that nvcc know ptx instruction madc - echo "__global__ void test (int *a, int b) { - asm(\"mad.lo.cc.u32 %0, %0, %1, %1;\": - \"+r\"(*a) : \"r\"(b));} " > conftest.cu - AC_MSG_CHECKING([if nvcc know ptx instruction madc]) - $NVCC -c conftest.cu -o conftest.o $NVCCFLAGS --generate-code arch=compute_20,code=compute_20 > /dev/null 2>&1 - AS_IF([test "$?" 
-eq "0"], + AC_MSG_CHECKING([if nvcc knows ptx instruction madc]) + NVCC_CHECK_COMPILE( + [ + __global__ void test (int *a, int b) { + asm(\"mad.lo.cc.u32 %0, %0, %1, %1;\": + \"+r\"(*a) : \"r\"(b));} + ], + [$NVCCFLAGS --generate-code arch=compute_${MIN_CC},code=compute_${MIN_CC}], + [AC_MSG_RESULT([yes])], [ - AC_MSG_RESULT([yes]) - ], [ AC_MSG_RESULT([no]) AC_MSG_ERROR([nvcc does not recognize ptx instruction madc, you should upgrade it]) ]) + AC_ARG_WITH(cgbn_include, + AS_HELP_STRING([--with-cgbn-include=DIR], [CGBN include directory]), + [ + cgbn_include=$withval + AC_MSG_NOTICE([Using CGBN from $cgbn_include]) + AS_IF([test "x$with_cgbn_include" != "xno"], + [ + AS_IF([test -d "$cgbn_include"], + [], + [AC_MSG_ERROR([Specified CGBN include directory "$cgbn_include" does not exist])]) + + AC_MSG_CHECKING([if CGBN is present]) + + dnl AC_CHECK_HEADER can't verify NVCC compilability hence NVCC_CHECK_COMPILE + NVCC_CHECK_COMPILE( + [ + #include + #include + ], + [-I$cgbn_include $GMPLIB], + [AC_MSG_RESULT([yes])], + [ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([cgbn.h not found (check if /cgbn needed after /include)]) + ] + ) + AC_DEFINE([HAVE_CGBN_H], [1], [Define to 1 if cgbn.h exists]) + NVCCFLAGS="-I$with_cgbn_include $GMPLIB $NVCCFLAGS" + want_cgbn="yes" + ]) + ]) + LIBS="$LIBS_BACKUP" LDFLAGS="$LDFLAGS_BACKUP" - - NVCCFLAGS="$NVCCFLAGS -DWITH_GPU $GPU_ARCH" + + NVCCFLAGS="$NVCCFLAGS $GPU_ARCH" CFLAGS="$CFLAGS -DWITH_GPU" CPPFLAGS="$CPPFLAGS -DWITH_GPU" NVCCFLAGS="$NVCCFLAGS --ptxas-options=-v" NVCCFLAGS="$NVCCFLAGS --compiler-options -fno-strict-aliasing" - # If debug flag is set apply debugging compilation flags, + # If debug flag is set apply debugging compilation flags, # otherwise build compilation flags AS_IF([test "x$DEBUG" = "xtrue"], [ @@ -596,6 +650,8 @@ ]) #Set this conditional if cuda is wanted AM_CONDITIONAL([WANT_GPU], [test "x$enable_gpu" = "xyes" ]) +#Set this conditional if cuda & cgbn_include +AM_CONDITIONAL([WANT_CGBN], [test "x$want_cgbn" = 
"xyes" ]) AC_SUBST(NVCC) AC_SUBST(NVCCFLAGS) @@ -604,32 +660,3 @@ AC_SUBST(CUDARPATH) ]) - -dnl Checks whether the stack can be marked nonexecutable by passing an option -dnl to the C-compiler when acting on .s files. Appends that option to ASMFLAGS. -dnl This macro is adapted from one found in GMP 6.1.1. -dnl FIXME: This test looks broken. It tests that a file with .note.GNU-stack... -dnl can be compiled/assembled with -Wa,--noexecstack. It does not determine -dnl if that command-line option has any effect on general asm code. -AC_DEFUN([CL_AS_NOEXECSTACK],[ -dnl AC_REQUIRE([AC_PROG_CC]) GMP uses something else -AC_CACHE_CHECK([whether assembler supports --noexecstack option], -cl_cv_as_noexecstack, [dnl - cat > conftest.c </dev/null]) \ - && grep .note.GNU-stack conftest.s >/dev/null \ - && AC_TRY_COMMAND([${CC} $CFLAGS $CPPFLAGS -Wa,--noexecstack - -c -o conftest.o conftest.s >/dev/null]) - then - cl_cv_as_noexecstack=yes - else - cl_cv_as_noexecstack=no - fi - rm -f conftest*]) - if test "$cl_cv_as_noexecstack" = yes; then - LIBECM_LDFLAGS="$LIBECM_LDFLAGS -Wl,-znoexecstack" - fi -]) diff -Nru gmp-ecm-7.0.4+ds/addlaws.c gmp-ecm-7.0.5+ds/addlaws.c --- gmp-ecm-7.0.4+ds/addlaws.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/addlaws.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,1301 @@ +/* addlaws.c - various addition laws for ECM + Author: F. 
Morain +*/ + +#include +#include +#include +#include + +#include /* GMP header file */ + +#include "ecm.h" /* ecm header file */ +#include "ecm-impl.h" +#include "ecm-ecm.h" +#include "mpmod.h" + +#include "addlaws.h" + +#if DEBUG_ADD_LAWS >= 1 +void +print_mpz_from_mpres(mpres_t x, mpmod_t n) +{ + mpz_t tmp; + + mpz_init(tmp); + mpres_get_z(tmp, x, n); + gmp_printf("%Zd", tmp); + mpz_clear(tmp); +} +#endif + +/******************** Weierstrass section ********************/ + +void +pt_w_set_to_zero(ell_point_t P, mpmod_t n) +{ + mpres_set_ui(P->x, 0, n); + mpres_set_ui(P->y, 1, n); + mpres_set_ui(P->z, 0, n); +} + +int +pt_w_is_zero(mpres_t z, mpmod_t n) +{ + return mpres_is_zero(z, n); +} + +void +pt_w_set(mpres_t x0, mpres_t y0, mpres_t z0, + mpres_t x, mpres_t y, mpres_t z, + ATTRIBUTE_UNUSED mpmod_t n) +{ + mpres_set(x0, x, n); + mpres_set(y0, y, n); + mpres_set(z0, z, n); +} + +#if DEBUG_ADD_LAWS >= 1 +void +pt_w_print(mpres_t x, mpres_t y, mpres_t z, ell_curve_t E, mpmod_t n) +{ + printf("["); + print_mpz_from_mpres(x, n); + printf(", "); + print_mpz_from_mpres(y, n); + printf(", "); + if(E->type == ECM_EC_TYPE_WEIERSTRASS && E->law == ECM_LAW_AFFINE) + gmp_printf("%Zd", z); + else + print_mpz_from_mpres(z, n); + printf("]"); +} +#endif + +/* [x0, y0, z0] <- [x1, y1, z1] + [x2, y2, z2] using lambda=num/den + with buffer inv. + + (lambda*x+mu)^2+a1*x*(lambda*x+mu)+a3*(lambda*x+mu)=x^3+a2*x^2+... + x^3+(a2-lambda^2-a1*lambda)*x^2+... = 0 + x1+x2+x3 = lambda^2+a1*lambda-a2. 
+ y3 = lambda*(x1-x3)-y1-a1*x3-a3 + */ +static int +pt_w_common_aff(mpz_t f, mpres_t x0, mpres_t y0, mpres_t z0, + mpres_t x1, mpres_t y1, + mpres_t x2, mpres_t a1, mpres_t a3, mpres_t a2, + mpmod_t n, mpres_t num, mpres_t den, mpres_t lambda) +{ + if(mpres_invert(lambda, den, n) == 0){ + mpres_gcd(f, den, n); + return 0; + } + /** lambda = num/den **/ + mpres_mul(lambda, lambda, num, n); + /** num <- (lambda+a1)*lambda **/ + mpres_add(num, lambda, a1, n); + mpres_mul(num, num, lambda, n); + mpres_sub(num, num, a2, n); + /** x0 = den <- num-x1-x2 **/ + mpres_sub(den, num, x1, n); + mpres_sub(den, den, x2, n); + /** y0 = num <- lambda*(x1-x0)-(y1+a1*x0+a3) **/ + mpres_sub(num, x1, den, n); + mpres_mul(num, num, lambda, n); + mpres_sub(y0, num, y1, n); + mpres_sub(y0, y0, a3, n); + mpres_mul(x0, a1, den, n); + mpres_sub(y0, y0, x0, n); + /** finish **/ + mpres_set(x0, den, n); + mpz_set_ui(z0, 1); /* just in case */ + return 1; +} + +/* [x3, y3, z3] <- [2] * [x1, y1, z1] */ +int +pt_w_duplicate(mpz_t f, mpres_t x3, mpres_t y3, mpres_t z3, + mpres_t x1, mpres_t y1, mpres_t z1, + mpmod_t n, ell_curve_t E) +{ + if(pt_w_is_zero(z1, n) == 1){ + pt_w_set(x3, y3, z3, x1, y1, z1, n); + return 1; + } + if(E->type == ECM_EC_TYPE_WEIERSTRASS && E->law == ECM_LAW_AFFINE){ + /* buf[1] <- 2*y1+a1*x1+a3 */ + mpres_mul(E->buf[1], E->a1, x1, n); + mpres_add(E->buf[1], E->buf[1], E->a3, n); + mpres_add(E->buf[1], E->buf[1], y1, n); + mpres_add(E->buf[1], E->buf[1], y1, n); + if(mpres_is_zero(E->buf[1], n)){ + /* buf1 = 0 <=> P is a [2]-torsion point */ + mpres_set_ui(x3, 0, n); + mpres_set_ui(y3, 1, n); + mpres_set_ui(z3, 0, n); + return 1; + } + /* buf[0] <- 3*x^2+2*a2*x+a4-a1*y = (3*x+2*a2)*x+a4-a1*y */ + mpres_mul_ui(E->buf[0], x1, 3, n); + mpres_add(E->buf[0], E->buf[0], E->a2, n); + mpres_add(E->buf[0], E->buf[0], E->a2, n); + mpres_mul(E->buf[0], E->buf[0], x1, n); + mpres_add(E->buf[0], E->buf[0], E->a4, n); + mpres_mul(E->buf[2], E->a1, y1, n); + mpres_sub(E->buf[0], 
E->buf[0], E->buf[2], n); + return pt_w_common_aff(f, x3, y3, z3, x1, y1, x1, + E->a1, E->a3, E->a2, n, + E->buf[0], E->buf[1], E->buf[2]); + } + else if(E->type == ECM_EC_TYPE_WEIERSTRASS + && E->law == ECM_LAW_HOMOGENEOUS){ + /* source is dbl-2007-bl: 5M + 6S + 1*a + 7add + 3*2 + 1*3 */ + /* mapping: h = buf[0], w = buf[1], s = buf[2], RR = buf[3], B = buf[4];*/ + /* h:=X1^2 mod p; # S*/ + mpres_sqr(E->buf[0], x1, n); + /* w:=Z1^2 mod p;*/ + mpres_sqr(E->buf[1], z1, n); + /* w:=a*w mod p;*/ + mpres_mul(E->buf[1], E->buf[1], E->a4, n); + /* s:=3*h mod p; # *3*/ + mpres_mul_ui(E->buf[2], E->buf[0], 3, n); + /* w:=w+s mod p;*/ + mpres_add(E->buf[1], E->buf[1], E->buf[2], n); + /* s:=Y1*Z1 mod p;*/ + mpres_mul(E->buf[2], y1, z1, n); + /* s:=2*s mod p;*/ + mpres_mul_ui(E->buf[2], E->buf[2], 2, n); + /* Z3:=s^2 mod p;*/ + mpres_sqr(z3, E->buf[2], n); + /* Z3:=s*Z3 mod p;*/ + mpres_mul(z3, z3, E->buf[2], n); + /* RR:=Y1*s mod p; # M*/ + mpres_mul(E->buf[3], y1, E->buf[2], n); + /* B:=X1+RR mod p; # add*/ + mpres_add(E->buf[4], x1, E->buf[3], n); + /* B:=B^2 mod p;*/ + mpres_sqr(E->buf[4], E->buf[4], n); + /* RR:=RR^2 mod p; # S*/ + mpres_sqr(E->buf[3], E->buf[3], n); + /* B:=B-h mod p;*/ + mpres_sub(E->buf[4], E->buf[4], E->buf[0], n); + /* B:=B-RR mod p;*/ + mpres_sub(E->buf[4], E->buf[4], E->buf[3], n); + /* h:=w^2 mod p;*/ + mpres_sqr(E->buf[0], E->buf[1], n); + /* X3:=2*B mod p;*/ + mpres_mul_ui(x3, E->buf[4], 2, n); + /* h:=h-X3 mod p;*/ + mpres_sub(E->buf[0], E->buf[0], x3, n); + /* X3:=h*s mod p; # M*/ + mpres_mul(x3, E->buf[0], E->buf[2], n); + /* s:=B-h mod p;*/ + mpres_sub(E->buf[2], E->buf[4], E->buf[0], n); + /* s:=w*s mod p;*/ + mpres_mul(E->buf[2], E->buf[2], E->buf[1], n); + /* Y3:=2*RR mod p;*/ + mpres_mul_ui(y3, E->buf[3], 2, n); + /* Y3:=s-Y3 mod p;*/ + mpres_sub(y3, E->buf[2], y3, n); + return 1; + } + return 0; +} + +/* [x3, y3, z3] <- [x1, y1, z1] + [x2, y2, z2]; P3 can be either P1 or P2. 
*/ +int +pt_w_add(mpz_t f, mpres_t x3, mpres_t y3, mpres_t z3, + mpres_t x1, mpres_t y1, mpres_t z1, + mpres_t x2, mpres_t y2, mpres_t z2, + mpmod_t n, ell_curve_t E) +{ + if(pt_w_is_zero(z1, n)){ + pt_w_set(x3, y3, z3, x2, y2, z2, n); + return 1; + } + else if(pt_w_is_zero(z2, n)){ + pt_w_set(x3, y3, z3, x1, y1, z1, n); + return 1; + } + if(E->type == ECM_EC_TYPE_WEIERSTRASS && E->law == ECM_LAW_AFFINE) + if(mpres_equal(x1, x2, n) && mpres_equal(y1, y2, n)) + return pt_w_duplicate(f, x3, y3, z3, x1, y1, z1, n, E); + else{ + mpres_sub(E->buf[0], y1, y2, n); + mpres_sub(E->buf[1], x1, x2, n); + return pt_w_common_aff(f, x3, y3, z3, x1, y1, x2, + E->a1, E->a3, E->a2, + n, E->buf[0], E->buf[1], E->buf[2]); + } + else if(E->type == ECM_EC_TYPE_WEIERSTRASS + && E->law == ECM_LAW_HOMOGENEOUS){ + /* Cohen-Miyaji-Ono: 12M+2S+6add+1*2 */ + /* mapping: y1z2 = buf, AA = buf+1, u = buf+2, v = buf+3, R = buf+4, */ + /* vvv = buf+5; */ +#if DEBUG_ADD_LAWS >= 2 + printf("y1="); print_mpz_from_mpres(y1, n); printf("\n"); + printf("y2="); print_mpz_from_mpres(y2, n); printf("\n"); + printf("z1="); print_mpz_from_mpres(z1, n); printf("\n"); + printf("z2="); print_mpz_from_mpres(z2, n); printf("\n"); +#endif + /* Y1Z2:=Y1*Z2 mod p; # M*/ + mpres_mul(E->buf[0], y1, z2, n); + /* A:=X1*Z2 mod p; # M*/ + mpres_mul(E->buf[1], x1, z2, n); + /* u:=Y2*Z1 mod p;*/ + mpres_mul(E->buf[2], y2, z1, n); + /* u:=u-Y1Z2 mod p;*/ + mpres_sub(E->buf[2], E->buf[2], E->buf[0], n); + /* v:=X2*Z1 mod p;*/ + mpres_mul(E->buf[3], x2, z1, n); + /* v:=v-A mod p;*/ + mpres_sub(E->buf[3], E->buf[3], E->buf[1], n); + if(mpz_sgn(E->buf[2]) == 0 && mpz_sgn(E->buf[3]) == 0){ + /* u = 0 <=> Y2*Z1 = Y1*Z2 <=> Y2/Z2 = Y1/Z1*/ + /* v = 0 <=> X2*Z1 = X1*Z2 <=> X2/Z2 = X1/Z1*/ + return pt_w_duplicate(f, x3, y3, z3, x1, y1, z1, n, E); + } + /* Z3:=Z1*Z2 mod p; # M*/ + mpres_mul(z3, z1, z2, n); + /* X3:=u^2 mod p;*/ + mpres_sqr(x3, E->buf[2], n); + /* X3:=X3*Z3 mod p;*/ + mpres_mul(x3, x3, z3, n); + /* R:=v^2 mod p;*/ + 
mpres_sqr(E->buf[4], E->buf[3], n); + /* vvv:=v*R mod p;*/ + mpres_mul(E->buf[5], E->buf[3], E->buf[4], n); + /* R:=R*A mod p;*/ + mpres_mul(E->buf[4], E->buf[4], E->buf[1], n); + /* Y3:=2*R mod p; # *2*/ + mpres_mul_ui(y3, E->buf[4], 2, n); + /* A:=X3-vvv mod p;*/ + mpres_sub(E->buf[1], x3, E->buf[5], n); + /* A:=A-Y3 mod p;*/ + mpres_sub(E->buf[1], E->buf[1], y3, n); + /* X3:=v*A mod p; # M*/ + mpres_mul(x3, E->buf[3], E->buf[1], n); + /* Y3:=R-A mod p;*/ + mpres_sub(y3, E->buf[4], E->buf[1], n); + /* Y3:=u*Y3 mod p;*/ + mpres_mul(y3, y3, E->buf[2], n); + /* A:=vvv*Y1Z2 mod p;*/ + mpres_mul(E->buf[1], E->buf[5], E->buf[0], n); + /* Y3:=Y3-A mod p;*/ + mpres_sub(y3, y3, E->buf[1], n); + /* Z3:=vvv*Z3 mod p; # M*/ + mpres_mul(z3, z3, E->buf[5], n); + return 1; + } + return 0; +} + +#if USE_ADD_SUB_CHAINS > 0 +/* [x3, y3, z3] <- [x1, y1, z1] - [x2, y2, z2]; P3 != P1, P3 != P2. + -P2 ~ -(x2/z2, y2/z2, 1) = (x2/z2, -y2/z2-a1*x/z2-a3, 1) + ~ (x2, -y2-a1*x2-a3*z2, z2). +*/ +int +pt_w_sub(mpz_t f, mpres_t x3, mpres_t y3, mpres_t z3, + mpres_t x1, mpres_t y1, mpres_t z1, + mpres_t x2, mpres_t y2, mpres_t z2, + mpmod_t n, ell_curve_t E) +{ + int res = 1; + + if(E->law == ECM_LAW_HOMOGENEOUS){ + /* FIXME: does not work for complete equation! */ + mpres_neg(y2, y2, n); + res = pt_w_add(f, x3, y3, z3, x1, y1, z1, x2, y2, z2, n, E); + mpres_neg(y2, y2, n); + } + else if(E->law == ECM_LAW_AFFINE){ + /* buf[3] not used in law, so use it */ + mpres_mul(E->buf[3], E->a1, x2, n); + mpres_add(E->buf[3], E->buf[3], E->a3, n); + mpres_add(E->buf[3], E->buf[3], y2, n); + mpres_neg(E->buf[3], E->buf[3], n); + res = pt_w_add(f, x3, y3, z3, x1, y1, z1, x2, E->buf[3], z2, n, E); + } + return res; +} +#endif + +/******************** projective Hessian form ********************/ + +/* U^3+V^3+W^3 = 3*D*U*V*W, D^3 <> 1. + O_H = [1:-1:0] + -[u:v:w] = [v:u:w] + Warning: there can exist two other points at infinity, namely + [1:-omega:0] and [1:-omega^2:0] where omega^3 = 1. 
+*/ +int +hessian_is_zero(ell_point_t P, ATTRIBUTE_UNUSED ell_curve_t E, mpmod_t n) +{ + mpres_t tmp; + int ret; + + if(mpz_sgn(P->z) != 0) + return 0; + mpres_init(tmp, n); + mpres_add(tmp, P->x, P->y, n); + ret = mpz_sgn(tmp) == 0; +#if 0 + if(ret) + gmp_printf("found a third root of unity? %Zd/%Zd\n", P->x, P->y); +#endif + mpres_clear(tmp, n); + return ret; +} + +void +hessian_set_to_zero(ell_point_t P, ATTRIBUTE_UNUSED ell_curve_t E, mpmod_t n) +{ + mpres_set_si(P->x, 1, n); + mpres_set_si(P->y, -1, n); + mpres_set_si(P->z, 0, n); +} + +#if DEBUG_ADD_LAWS >= 1 +void +hessian_print(ell_point_t P, ell_curve_t E, mpmod_t n) +{ + pt_w_print(P->x, P->y, P->z, E, n); +} +#endif + +#if USE_ADD_SUB_CHAINS > 0 +/* -[u:v:w] = [v:u:w] */ +void +hessian_negate(ell_point_t P, ATTRIBUTE_UNUSED ell_curve_t E, ATTRIBUTE_UNUSED mpmod_t n) +{ + mpz_swap(P->x, P->y); /* humf */ +} +#endif + +/* TODO: decrease the number of buffers? */ +int +hessian_duplicate(ell_point_t R, ell_point_t P, + ATTRIBUTE_UNUSED ell_curve_t E, mpmod_t n) +{ + /* A = buf[0], ..., G = buf[6], H = buf[7], J = buf[8] */ + /* A:=P[1]^2 mod N; */ + mpres_mul(E->buf[0], P->x, P->x, n); + /* B:=P[2]^2 mod N; */ + mpres_mul(E->buf[1], P->y, P->y, n); + /* C:=P[3]^2 mod N; */ + mpres_mul(E->buf[2], P->z, P->z, n); + /* D:=(A+B) mod N; */ + mpres_add(E->buf[3], E->buf[0], E->buf[1], n); + /* E:=(A+C) mod N; */ + mpres_add(E->buf[4], E->buf[0], E->buf[2], n); + /* F:=(B+C) mod N; */ + mpres_add(E->buf[5], E->buf[1], E->buf[2], n); + /* G:=((P[1]+P[2])^2-D) mod N; */ + mpres_add(E->buf[6], P->x, P->y, n); + mpres_mul(E->buf[6], E->buf[6], E->buf[6], n); + mpres_sub(E->buf[6], E->buf[6], E->buf[3], n); + /* H:=((P[1]+P[3])^2-E) mod N; */ + mpres_add(E->buf[7], P->x, P->z, n); + mpres_mul(E->buf[7], E->buf[7], E->buf[7], n); + mpres_sub(E->buf[7], E->buf[7], E->buf[4], n); + /* J:=((P[2]+P[3])^2-F) mod N; */ + mpres_add(E->buf[8], P->y, P->z, n); + mpres_mul(E->buf[8], E->buf[8], E->buf[8], n); + 
mpres_sub(E->buf[8], E->buf[8], E->buf[5], n); + /* R->x = ((J-G)*(H+2*E)) mod N */ + mpres_sub(E->buf[0], E->buf[8], E->buf[6], n); + mpres_add(E->buf[1], E->buf[7], E->buf[4], n); + mpres_add(E->buf[1], E->buf[1], E->buf[4], n); + mpres_mul(R->x, E->buf[0], E->buf[1], n); + /* R->y = ((G-H)*(J+2*F)) mod N */ + mpres_sub(E->buf[0], E->buf[6], E->buf[7], n); + mpres_add(E->buf[1], E->buf[8], E->buf[5], n); + mpres_add(E->buf[1], E->buf[1], E->buf[5], n); + mpres_mul(R->y, E->buf[0], E->buf[1], n); + /* R->z = ((H-J)*(G+2*D)) mod N */ + mpres_sub(E->buf[0], E->buf[7], E->buf[8], n); + mpres_add(E->buf[1], E->buf[6], E->buf[3], n); + mpres_add(E->buf[1], E->buf[1], E->buf[3], n); + mpres_mul(R->z, E->buf[0], E->buf[1], n); + return 1; +} + +/* TODO: reduce the number of buffers? */ +int +hessian_plus(ell_point_t R, ell_point_t P, ell_point_t Q, + ATTRIBUTE_UNUSED ell_curve_t E, mpmod_t n) +{ + /* P = [T1,T2,T3], Q = [T4,T5,T6] */ + /* P = Q <=> T1/T3=T4/T6 and T2/T3=T5/T6 + <=> T1*T6=T3*T4 and T2*T6=T3*T5 + */ + /* T1 = buf[0], ..., T7 = buf[6] */ + /* T7:=(T1*T6) mod N; */ + mpres_mul(E->buf[6], P->x, Q->z, n); + /* T1:=(T1*T5) mod N; */ + mpres_mul(E->buf[0], P->x, Q->y, n); + /* T5:=(T3*T5) mod N; */ + mpres_mul(E->buf[4], P->z, Q->y, n); + /* T3:=(T3*T4) mod N; */ + mpres_mul(E->buf[2], P->z, Q->x, n); + /* T4:=(T2*T4) mod N; */ + mpres_mul(E->buf[3], P->y, Q->x, n); + /* T2:=(T2*T6) mod N; */ + mpres_mul(E->buf[1], P->y, Q->z, n); + + if(mpres_equal(E->buf[6], E->buf[2], n) + && mpres_equal(E->buf[4], E->buf[1], n)) + /* as a matter of that, P = Q and we need duplicate */ + return hessian_duplicate(R, P, E, n); + + /* T6:=(T2*T7) mod N; */ + mpres_mul(E->buf[5], E->buf[1], E->buf[6], n); + /* T2:=(T2*T4) mod N; */ + mpres_mul(E->buf[1], E->buf[1], E->buf[3], n); + /* T4:=(T3*T4) mod N; */ + mpres_mul(E->buf[3], E->buf[2], E->buf[3], n); + /* T3:=(T3*T5) mod N; */ + mpres_mul(E->buf[2], E->buf[2], E->buf[4], n); + /* T5:=(T1*T5) mod N; */ + mpres_mul(E->buf[4], 
E->buf[0], E->buf[4], n); + /* T1:=(T1*T7) mod N; */ + mpres_mul(E->buf[0], E->buf[0], E->buf[6], n); + /* T1:=(T1-T4) mod N; */ + mpres_sub(R->y, E->buf[0], E->buf[3], n); + /* T2:=(T2-T5) mod N; */ + mpres_sub(R->x, E->buf[1], E->buf[4], n); + /* T3:=(T3-T6) mod N; */ + mpres_sub(R->z, E->buf[2], E->buf[5], n); + /* return [T2, T1, T3]; */ + return 1; +} + +int +hessian_add(ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n) +{ + if(hessian_is_zero(P, E, n)){ + ell_point_set(R, Q, E, n); + return 1; + } + else if(hessian_is_zero(Q, E, n)){ + ell_point_set(R, P, E, n); + return 1; + } + else + return hessian_plus(R, P, Q, E, n); +} + +#if USE_ADD_SUB_CHAINS > 0 +int +hessian_sub(ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n) +{ + int ret; + + hessian_negate(Q, E, n); + ret = hessian_add(R, P, Q, E, n); + hessian_negate(Q, E, n); + return ret; +} +#endif + +/* switch from X^3+Y^3+1=3*D*X*Y to Y^2=X^3+A*X+B + A:=-27*D*(D^3+8); + B:=54*(D^6-20*D^3-8); + xi:=12*(D^3-1)/(D*u+v+1); + x:=-9*D^2+xi*u; + y:=3*xi*(v-1); + OUTPUT: If a factor is found during the inversion, it is put in f and + ECM_FACTOR_FOUND_STEP1 is returned. Otherwise, ECM_NO_FACTOR_FOUND is + returned. 
+ SIDE-EFFECT: (x, y, D) <- (x_on_W, y_on_W, A_of_W) + */ +int +hessian_to_weierstrass(mpz_t f, mpres_t x, mpres_t y, mpres_t D, mpmod_t n) +{ + mpres_t D3, A, xi, tmp1, tmp2; + int ret = ECM_NO_FACTOR_FOUND; + +#if DEBUG_ADD_LAWS >= 1 + printf("P:=["); + print_mpz_from_mpres(x, n); + printf(", "); + print_mpz_from_mpres(y, n); + printf(", 1];\n"); + printf("D:="); + print_mpz_from_mpres(D, n); + printf(";\n"); +#endif + /* D3 <- D^3 */ + mpres_init(D3, n); + mpres_mul(D3, D, D, n); + mpres_mul(D3, D3, D, n); + /* finish A */ + mpres_init(A, n); + mpres_add_ui(A, D3, 8, n); + mpres_mul(A, A, D, n); + mpres_mul_ui(A, A, 27, n); + mpres_neg(A, A, n); + /* compute xi */ + mpres_init(xi, n); + mpres_init(tmp1, n); + mpres_mul(tmp1, D, x, n); + mpres_add(tmp1, tmp1, y, n); + mpres_add_ui(tmp1, tmp1, 1, n); + mpres_init(tmp2, n); + mpres_sub_ui(tmp2, D3, 1, n); + mpres_mul_ui(tmp2, tmp2, 12, n); + if(mpres_invert(xi, tmp1, n) == 0){ + mpres_gcd(f, tmp1, n); + ret = ECM_FACTOR_FOUND_STEP1; + } + else{ + mpres_mul(xi, xi, tmp2, n); + /* compute x */ + mpres_mul(tmp1, D, D, n); + mpres_mul_ui(tmp1, tmp1, 9, n); + mpres_mul(tmp2, xi, x, n); + mpres_sub(x, tmp2, tmp1, n); + /* compute y */ + mpres_sub_ui(tmp1, y, 1, n); + mpres_mul(tmp1, tmp1, xi, n); + mpres_mul_ui(y, tmp1, 3, n); + mpres_set(D, A, n); +#if DEBUG_ADD_LAWS >= 1 + printf("WP:=["); + print_mpz_from_mpres(x, n); + printf(", "); + print_mpz_from_mpres(y, n); + printf(", 1];\n"); + printf("WA:="); + print_mpz_from_mpres(D, n); + printf(";\nWB:=(WP[2]^2-WP[1]^3-WA*WP[1]) mod N;WE:=[WA, WB];\n"); +#endif + } + mpres_clear(A, n); + mpres_clear(D3, n); + mpres_clear(xi, n); + mpres_clear(tmp1, n); + mpres_clear(tmp2, n); + return ret; +} + +int +mult_by_3(mpz_t f, mpres_t x, mpres_t y, mpres_t A, mpmod_t n) +{ + ell_curve_t E; + ell_point_t P, Q; + int ret = ECM_NO_FACTOR_FOUND; + mpz_t e; + + ell_curve_init_set(E, ECM_EC_TYPE_WEIERSTRASS, ECM_LAW_AFFINE, A, n); + ell_point_init(P, E, n); + mpres_set(P->x, x, n); + 
mpres_set(P->y, y, n); + mpres_set_ui(P->z, 1, n); + ell_point_init(Q, E, n); + mpz_init_set_ui(e, 3); + if(ell_point_mul(f, Q, e, P, E, n) != 0){ + mpres_set(x, Q->x, n); + mpres_set(y, Q->y, n); + } else { ret = ECM_FACTOR_FOUND_STEP1; } /* ell_point_mul() gave 0: presumably a failed inversion that left a factor in f (as pt_w_common_aff() does) - confirm; report it instead of always returning ECM_NO_FACTOR_FOUND */ + mpz_clear(e); + ell_point_clear(Q, E, n); + ell_point_clear(P, E, n); + ell_curve_clear(E, n); + return ret; +} + +/******************** projective twisted Hessian form ********************/ + +/* a*U^3+V^3+W^3 = d*U*V*W + O_E = [0:-1:1] + -[U:V:W]=[U:W:V] +*/ +int +twisted_hessian_is_zero(ell_point_t P, ATTRIBUTE_UNUSED ell_curve_t E, mpmod_t n) +{ + mpres_t tmp; + int ret; + + if(mpz_sgn(P->x) != 0) + return 0; + mpres_init(tmp, n); + mpres_add(tmp, P->y, P->z, n); + ret = mpz_sgn(tmp) == 0; +#if 0 + if(ret) + gmp_printf("found a third root of unity? %Zd/%Zd\n", P->x, P->y); +#endif + mpres_clear(tmp, n); + return ret; +} + +void +twisted_hessian_set_to_zero(ell_point_t P, ATTRIBUTE_UNUSED ell_curve_t E, mpmod_t n) +{ + mpres_set_si(P->x, 0, n); + mpres_set_si(P->y, -1, n); + mpres_set_si(P->z, 1, n); +} + +#if DEBUG_ADD_LAWS >= 1 +void +twisted_hessian_print(ell_point_t P, ell_curve_t E, mpmod_t n) +{ + pt_w_print(P->x, P->y, P->z, E, n); +} +#endif + +#if USE_ADD_SUB_CHAINS > 0 +/* -[u:v:w] = [u:w:v] */ +void +twisted_hessian_negate(ell_point_t P, ATTRIBUTE_UNUSED ell_curve_t E, ATTRIBUTE_UNUSED mpmod_t n) +{ + mpz_swap(P->y, P->z); /* humf */ +} +#endif + +/* TODO: decrease the number of buffers? 
*/ +/* 6M+2S+1M_d: better when d is small */ +int +twisted_hessian_duplicate(ell_point_t R, ell_point_t P, + ATTRIBUTE_UNUSED ell_curve_t E, mpmod_t n) +{ + /* R = buf[0], ..., W = buf[5], C = buf[6], D = buf[7], E = buf[8] */ + /* R:=Y1+Z1;*/ + mpres_add(E->buf[0], P->y, P->z, n); + /* S:=Y1-Z1;*/ + mpres_sub(E->buf[1], P->y, P->z, n); + /* T:=R^2 mod N;*/ + mpres_sqr(E->buf[2], E->buf[0], n); + /* U:=S^2 mod N;*/ + mpres_sqr(E->buf[3], E->buf[1], n); + /* V:=T+3*U;*/ + mpres_add(E->buf[4], E->buf[2], E->buf[3], n); + mpres_add(E->buf[4], E->buf[4], E->buf[3], n); + mpres_add(E->buf[4], E->buf[4], E->buf[3], n); + /* W:=3*T+U;*/ + mpres_add(E->buf[5], E->buf[3], E->buf[2], n); + mpres_add(E->buf[5], E->buf[5], E->buf[2], n); + mpres_add(E->buf[5], E->buf[5], E->buf[2], n); + /* C:=(R*V) mod N;*/ + mpres_mul(E->buf[6], E->buf[0], E->buf[4], n); + /* D:=(S*W) mod N;*/ + mpres_mul(E->buf[7], E->buf[1], E->buf[5], n); + /* E:=(3*C-E0[2]*X1*(W-V)) mod N;*/ + mpres_sub(E->buf[8], E->buf[5], E->buf[4], n); + mpres_mul(E->buf[8], E->buf[8], P->x, n); + mpres_mul(E->buf[8], E->buf[8], E->a6, n); + mpres_sub(E->buf[8], E->buf[6], E->buf[8], n); + mpres_add(E->buf[8], E->buf[8], E->buf[6], n); + mpres_add(E->buf[8], E->buf[8], E->buf[6], n); + /* X3:=(-2*X1*D) mod N;*/ + mpres_mul(R->x, P->x, E->buf[7], n); + mpres_add(R->x, R->x, R->x, n); + mpres_neg(R->x, R->x, n); + /* Y3:=((D+E)*Z1) mod N;*/ + mpres_add(E->buf[0], E->buf[7], E->buf[8], n); + mpres_mul(E->buf[1], E->buf[0], P->z, n); + /* Z3:=((D-E)*Y1) mod N;*/ + mpres_sub(E->buf[0], E->buf[7], E->buf[8], n); + mpres_mul(R->z, E->buf[0], P->y, n); + mpres_set(R->y, E->buf[1], n); + return 1; +} + +/* TODO: reduce the number of buffers? */ +int +twisted_hessian_plus(ell_point_t R, ell_point_t P, ell_point_t Q, + ATTRIBUTE_UNUSED ell_curve_t E, mpmod_t n) +{ + /* A = buf[0], ... 
F = buf[5], G = [6], H = [7], J = [8] */ + // A:=X1*Z2 mod N; + mpres_mul(E->buf[0], P->x, Q->z, n); + // B:=Z1*Z2 mod N; + mpres_mul(E->buf[1], P->z, Q->z, n); + // C:=Y1*X2 mod N; + mpres_mul(E->buf[2], P->y, Q->x, n); + // D:=Y1*Y2 mod N; + mpres_mul(E->buf[3], P->y, Q->y, n); + // E:=Z1*Y2 mod N; + mpres_mul(E->buf[4], P->z, Q->y, n); + // F:=E0[1]*X1*X2 mod N; + mpres_mul(E->buf[5], P->x, Q->x, n); + mpres_mul(E->buf[5], E->buf[5], E->a4, n); + // Hisil + // G := (D+B)*(A-C) mod N; + mpres_add(E->buf[9], E->buf[3], E->buf[1], n); + mpres_sub(E->buf[6], E->buf[0], E->buf[2], n); + mpres_mul(E->buf[6], E->buf[6], E->buf[9], n); + // H := (D-B)*(A+C) mod N; + mpres_sub(E->buf[9], E->buf[3], E->buf[1], n); + mpres_add(E->buf[7], E->buf[0], E->buf[2], n); + mpres_mul(E->buf[7], E->buf[7], E->buf[9], n); + // J := (D+F)*(A-E) mod N; + mpres_add(E->buf[9], E->buf[3], E->buf[5], n); + mpres_sub(E->buf[8], E->buf[0], E->buf[4], n); + mpres_mul(E->buf[8], E->buf[8], E->buf[9], n); + // K := (D-F)*(A+E) mod N; + // this is the last use of A, so that K -> buf[0] + mpres_sub(E->buf[9], E->buf[3], E->buf[5], n); + mpres_add(E->buf[0], E->buf[0], E->buf[4], n); + mpres_mul(E->buf[0], E->buf[0], E->buf[9], n); + // X3 := G-H + mpres_sub(R->x, E->buf[6], E->buf[7], n); + // Y3 := K-J + mpres_sub(R->y, E->buf[0], E->buf[8], n); + // Z3 := (J+K-G-H-2*(B-F)*(C+E)) mod N; + mpres_sub(E->buf[9], E->buf[1], E->buf[5], n); + mpres_add(R->z, E->buf[2], E->buf[4], n); + mpres_mul(R->z, R->z, E->buf[9], n); + mpres_add(R->z, R->z, R->z, n); + mpres_add(R->z, R->z, E->buf[7], n); + mpres_add(R->z, R->z, E->buf[6], n); + mpres_sub(R->z, E->buf[0], R->z, n); + mpres_add(R->z, R->z, E->buf[8], n); + if(mpz_sgn(R->x) == 0 && mpz_sgn(R->y) == 0 && mpz_sgn(R->z) == 0){ + // iff (X2:Y2:Z2)=(Z1:gamma^2*X1:gamma*Y1), gamma^3 = a + fprintf(stderr, "GASP: X3, Y3 and Z3 are 0\n"); + exit(-1); +#if 0 + // TODO: rewrite with above quantities! 
+ X3:=(X1^2*Y2*Z2-X2^2*Y1*Z1) mod N; + // A*X1*Y2-C*X2*Z1 = A*U-C*V + Y3:=(Z1^2*X2*Y2-Z2^2*X1*Y1) mod N; + // E*Z1*X2-A*Z2*Y1 = E*V-A*W + Z3:=(Y1^2*X2*Z2-Y2^2*X1*Z1) mod N; + // C*Y1*Z2-E*Y2*X1 = C*W-E*U + + // X3 = Y1*(a*X1^3-Z1^3) + // Y3 = g^2*X1*(Z1^3-Y1^3) + // Z3 = g*Z1*(Y1^3-Z1^3) + + // can be made faster with a = aa^3, since then g = aa and we + // can share many things + +#endif + } + return 1; +} + +int +twisted_hessian_add(ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n) +{ + if(twisted_hessian_is_zero(P, E, n)){ + ell_point_set(R, Q, E, n); + return 1; + } + else if(twisted_hessian_is_zero(Q, E, n)){ + ell_point_set(R, P, E, n); + return 1; + } + else + return twisted_hessian_plus(R, P, Q, E, n); +} + +#if USE_ADD_SUB_CHAINS > 0 +int +twisted_hessian_sub(ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n) +{ + int ret; + + twisted_hessian_negate(Q, E, n); + ret = twisted_hessian_add(R, P, Q, E, n); + twisted_hessian_negate(Q, E, n); + return ret; +} +#endif + +/* INPUT: a*x^3+y^3+1 = d*x*y + OUTPUT: Y^2 = X^3+A*X+B + If a=c^3, then curve isom to Hessian (c*x)^3+y^3+1=3*(d/(3*c))*(c*x)*y + SIDE EFFECT: (x, y, c) <- (x_on_W, y_on_W, A_of_W) + */ +int +twisted_hessian_to_weierstrass(mpz_t f, mpres_t x, mpres_t y, mpres_t c, mpres_t d, mpmod_t n) +{ + int ret = ECM_NO_FACTOR_FOUND; + mpres_t tmp; + +#if DEBUG_ADD_LAWS >= 2 + printf("x_tH="); print_mpz_from_mpres(x, n); printf("\n"); + printf("y_tH="); print_mpz_from_mpres(y, n); printf("\n"); + printf("c_tH="); print_mpz_from_mpres(c, n); printf("\n"); + printf("d_tH="); print_mpz_from_mpres(d, n); printf("\n"); +#endif + mpres_init(tmp, n); + mpres_mul_ui(tmp, c, 3, n); + if(mpres_invert(tmp, tmp, n) == 0){ + mpres_gcd(f, tmp, n); + ret = ECM_FACTOR_FOUND_STEP1; + } + else{ + mpres_mul(x, x, c, n); + mpres_mul(c, tmp, d, n); + /* from x^3+y^3+1=3*c*x*y to Weierstrass stuff */ + ret = hessian_to_weierstrass(f, x, y, c, n); +#if DEBUG_ADD_LAWS >= 2 + printf("A_W="); 
print_mpz_from_mpres(c, n); printf("\n"); + printf("x_W="); print_mpz_from_mpres(x, n); printf("\n"); + printf("y_W="); print_mpz_from_mpres(y, n); printf("\n"); +#endif + } + mpres_clear(tmp, n); + return ret; +} + +/******************** generic ec's ********************/ + +void +ell_point_init(ell_point_t P, ell_curve_t E, mpmod_t n) +{ + mpres_init(P->x, n); + mpres_init(P->y, n); + mpres_init(P->z, n); + if(E->type == ECM_EC_TYPE_WEIERSTRASS){ + if(E->law == ECM_LAW_AFFINE) + mpz_set_ui(P->z, 1); /* humf */ + else if(E->law == ECM_LAW_HOMOGENEOUS) + mpres_set_ui(P->z, 1, n); + } + else if(E->type == ECM_EC_TYPE_HESSIAN + || E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + mpres_set_ui(P->z, 1, n); +} + +/* TODO: change this according to E->type */ +void +ell_point_clear(ell_point_t P, ATTRIBUTE_UNUSED ell_curve_t E, mpmod_t n) +{ + mpres_clear(P->x, n); + mpres_clear(P->y, n); + mpres_clear(P->z, n); +} + +#if DEBUG_ADD_LAWS >= 1 +void +ell_point_print(ell_point_t P, ell_curve_t E, mpmod_t n) +{ + if(E->type == ECM_EC_TYPE_WEIERSTRASS) + pt_w_print(P->x, P->y, P->z, E, n); + else if(E->type == ECM_EC_TYPE_HESSIAN) + hessian_print(P, E, n); + else if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + twisted_hessian_print(P, E, n); +} +#endif + +/* TODO: should depend on E->type... 
*/ +void +ell_point_set(ell_point_t Q, ell_point_t P, + ATTRIBUTE_UNUSED ell_curve_t E, ATTRIBUTE_UNUSED mpmod_t n) +{ + mpres_set(Q->x, P->x, n); + mpres_set(Q->y, P->y, n); + mpres_set(Q->z, P->z, n); +} + +void +ell_curve_init(ell_curve_t E, int etype, int law, mpmod_t n) +{ + int i; + + E->type = etype; + E->law = law; + mpres_init(E->a1, n); + mpres_init(E->a3, n); + mpres_init(E->a2, n); + mpres_init(E->a4, n); + mpres_init(E->a6, n); + mpres_set_ui(E->a1, 0, n); + mpres_set_ui(E->a3, 0, n); + mpres_set_ui(E->a2, 0, n); + mpres_set_ui(E->a4, 0, n); + mpres_set_ui(E->a6, 0, n); + for(i = 0; i < EC_W_NBUFS; i++) + mpres_init (E->buf[i], n); +} + +void +ell_curve_init_set(ell_curve_t E, int etype, int law, mpres_t A, mpmod_t n) +{ + ell_curve_init(E, etype, law, n); + mpres_set(E->a4, A, n); +} + +void +ell_curve_set_z(ell_curve_t E, ell_curve_t zE, mpmod_t n) +{ + ell_curve_init(E, zE->type, zE->law, n); + mpres_set_z(E->a1, zE->a1, n); + mpres_set_z(E->a3, zE->a3, n); + mpres_set_z(E->a2, zE->a2, n); + mpres_set_z(E->a4, zE->a4, n); + mpres_set_z(E->a6, zE->a6, n); +#if 0 + E->disc = zE->disc; + if(E->disc != 0){ + mpres_init(E->sq[0], n); + mpres_set_z(E->sq[0], zE->sq[0], n); + } +#endif +} + +void +ell_curve_clear(ell_curve_t E, mpmod_t n) +{ + int i; + + mpres_clear(E->a4, n); + for(i = 0; i < EC_W_NBUFS; i++) + mpres_clear (E->buf[i], n); + /* TODO: case of sq */ +} + +#if DEBUG_ADD_LAWS >= 1 +void +ell_curve_print(ell_curve_t E, mpmod_t n) +{ + if(E->type == ECM_EC_TYPE_WEIERSTRASS){ + printf("["); print_mpz_from_mpres(E->a1, n); + printf(", "); print_mpz_from_mpres(E->a3, n); + printf(", "); print_mpz_from_mpres(E->a2, n); + printf(", "); print_mpz_from_mpres(E->a4, n); + printf(", "); print_mpz_from_mpres(E->a6, n); printf("];\n"); + } + else if(E->type == ECM_EC_TYPE_HESSIAN){ + printf("D:="); print_mpz_from_mpres(E->a4, n); printf(";\n"); + printf("E:=[D];\n"); + } + else if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN){ + printf("a:="); 
print_mpz_from_mpres(E->a4, n); printf(";\n"); + printf("d:="); print_mpz_from_mpres(E->a6, n); printf(";\n"); + printf("E:=[a, d];\n"); + } +} +#endif + +/* OUTPUT: 1 if P = O_E, 0 otherwise. */ +int +ell_point_is_zero(ell_point_t P, ell_curve_t E, mpmod_t n) +{ + if(E->type == ECM_EC_TYPE_WEIERSTRASS) + return pt_w_is_zero(P->z, n); + else if(E->type == ECM_EC_TYPE_HESSIAN) + return hessian_is_zero(P, E, n); + else if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + return twisted_hessian_is_zero(P, E, n); + return ECM_ERROR; +} + +void +ell_point_set_to_zero(ell_point_t P, ell_curve_t E, mpmod_t n) +{ + if(E->type == ECM_EC_TYPE_WEIERSTRASS) + pt_w_set_to_zero(P, n); + else if(E->type == ECM_EC_TYPE_HESSIAN) + hessian_set_to_zero(P, E, n); + else if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + twisted_hessian_set_to_zero(P, E, n); +} + +int +ell_point_is_on_curve(ell_point_t P, ell_curve_t E, mpmod_t n) +{ + int ok = 1; + + if(ell_point_is_zero(P, E, n)) + return 1; + if(E->type == ECM_EC_TYPE_WEIERSTRASS){ + mpres_t tmp1, tmp2; + + mpres_init(tmp1, n); + mpres_init(tmp2, n); + if(E->law == ECM_LAW_AFFINE){ + /* y^2+a1*x*y+a3*y = x^3+a2*x^2+a4*x+a6? */ + mpres_mul(tmp1, E->a1, P->x, n); + mpres_add(tmp1, tmp1, P->y, n); + mpres_add(tmp1, tmp1, E->a3, n); + mpres_mul(tmp1, tmp1, P->y, n); + + mpres_add(tmp2, E->a2, P->x, n); + mpres_mul(tmp2, tmp2, P->x, n); + mpres_add(tmp2, tmp2, E->a4, n); + mpres_mul(tmp2, tmp2, P->x, n); + mpres_add(tmp2, tmp2, E->a6, n); + } +#if 0 // useless for the time being + else{ + /* y^2*z+a1*x*y*z+a3*y*z^2 = x^3+a2*x^2*z+a4*x*z^2+a6*z^3? */ + /* y*z*(y+a1*x+a3*z) = ((x+a2*z)*x+a4*z^2)*x+a6*z^3? */ + mpres_t tmp3; + + mpres_mul(tmp1, E->a1, P->x, n); /* a1*x */ + mpres_add(tmp1, tmp1, P->y, n); /* a1*x+y */ + mpres_mul(tmp2, E->a3, P->z, n); /* a3*z */ + mpres_add(tmp1, tmp1, tmp2, n); /* y+a1*x+a3*z */ + mpres_mul(tmp1, tmp1, P->y, n); /* y*(...) 
*/ + mpres_mul(tmp1, tmp1, P->z, n); /* lhs */ + + mpres_init(tmp3, n); + mpres_mul(tmp2, E->a2, P->z, n); /* a2*z */ + mpres_add(tmp2, tmp2, P->x, n); /* x+a2*z */ + mpres_mul(tmp2, tmp2, P->x, n); /* (x+a2*z)*x */ + mpres_mul(tmp3, E->a4, P->z, n); /* a4*z */ + mpres_mul(tmp3, tmp3, P->z, n); /* a4*z^2 */ + mpres_add(tmp2, tmp2, tmp3, n); /* (x+a2*z)*x+a4*z^2 */ + mpres_mul(tmp2, tmp2, P->x, n); /* (...)*x */ + mpres_mul(tmp3, P->z, P->z, n); /* z^2 */ + mpres_mul(tmp3, tmp3, P->z, n); /* z^3 */ + mpres_mul(tmp3, tmp3, E->a6, n); /* a6*z^3 */ + mpres_add(tmp2, tmp2, tmp3, n); /* rhs */ + mpres_clear(tmp3, n); + } +#endif + ok = mpres_equal(tmp1, tmp2, n); + + mpres_clear(tmp1, n); + mpres_clear(tmp2, n); + } + else if(E->type == ECM_EC_TYPE_HESSIAN){ + /* TODO */ + } + else if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN){ + /* TODO */ + } + return ok; +} + +#if DEBUG_ADD_LAWS >= 1 +static void +ell_point_check(ell_point_t P, ell_curve_t E, mpmod_t n) +{ + if(ell_point_is_on_curve(P, E, n) == 0){ + printf("Point not on curve\n"); + printf("E:="); + ell_curve_print(E, n); + printf("P:="); + pt_print(E, P, n); + printf("\n"); + exit(-1); + } +} +#endif + +#if DEBUG_ADD_LAWS >= 1 +int +ell_point_equal(ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n) +{ + int ret = 1; + + if(E->type == ECM_EC_TYPE_WEIERSTRASS){ + if(E->law == ECM_LAW_AFFINE) + return mpres_equal(P->x, Q->x, n) + && mpres_equal(P->y, Q->y, n) + && mpres_equal(P->z, Q->z, n); + else if(E->law == ECM_LAW_HOMOGENEOUS){ + mpres_t tmp1, tmp2; + + mpres_init(tmp1, n); + mpres_init(tmp2, n); + mpres_mul(tmp1, P->x, Q->z, n); + mpres_mul(tmp2, P->z, Q->x, n); + if(mpres_equal(tmp1, tmp2, n) == 0){ + printf("Px/Pz != Qx/Qz\n"); + ret = 0; + exit(-1); + } + else{ + mpres_mul(tmp1, P->y, Q->z, n); + mpres_mul(tmp2, P->z, Q->y, n); + if(mpres_equal(tmp1, tmp2, n) == 0){ + printf("Py/Pz != Qy/Qz\n"); + ret = 0; + exit(-1); + } + } + mpres_clear(tmp1, n); + mpres_clear(tmp2, n); + } + } + return ret; +} +#endif + 
+/* OUTPUT: 1 if everything ok, 0 otherwise */ +int +ell_point_add(mpz_t f, ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n) +{ + if(E->type == ECM_EC_TYPE_WEIERSTRASS) + return pt_w_add(f, R->x, R->y, R->z, P->x, P->y, P->z, + Q->x, Q->y, Q->z, n, E); + else if(E->type == ECM_EC_TYPE_HESSIAN) + return hessian_add(R, P, Q, E, n); + else if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + return twisted_hessian_add(R, P, Q, E, n); + else + return ECM_ERROR; +} + +#if USE_ADD_SUB_CHAINS > 0 +/* R <- P-Q */ +int +ell_point_sub(mpz_t f, ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n) +{ + if(E->type == ECM_EC_TYPE_WEIERSTRASS) + return pt_w_sub(f, R->x, R->y, R->z, P->x, P->y, P->z, + Q->x, Q->y, Q->z, n, E); + else if(E->type == ECM_EC_TYPE_HESSIAN) + return hessian_sub(R, P, Q, E, n); + else if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + return twisted_hessian_sub(R, P, Q, E, n); + else + return ECM_ERROR; +} +#endif + +int +ell_point_duplicate(mpz_t f, ell_point_t R, ell_point_t P, ell_curve_t E, mpmod_t n) +{ +#if DEBUG_ADD_LAWS >= 2 + printf("E:="); + ell_curve_print(E, n); +#endif + if(E->type == ECM_EC_TYPE_WEIERSTRASS) + return pt_w_duplicate(f, R->x, R->y, R->z, P->x, P->y, P->z, n, E); + else if(E->type == ECM_EC_TYPE_HESSIAN) + return hessian_duplicate(R, P, E, n); + else if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + return twisted_hessian_duplicate(R, P, E, n); + else + return ECM_ERROR; +} + +void +ell_point_negate(ell_point_t P, ell_curve_t E, mpmod_t n) +{ +#if DEBUG_ADD_LAWS >= 2 + printf("P:="); ell_point_print(P, E, n); printf(";\n"); +#endif + if(ell_point_is_zero(P, E, n) == 0){ + if(E->type == ECM_EC_TYPE_WEIERSTRASS){ + if(E->law == ECM_LAW_HOMOGENEOUS){ + /* FIXME: does not work for complete equation! 
*/ + mpres_neg(P->y, P->y, n); + } + else if(E->law == ECM_LAW_AFFINE){ + /* (-P).y = -P.y-a1*P.x-a3 */ + if(mpz_sgn(E->a1) != 0 + || mpz_sgn(E->a3) != 0 + || mpz_sgn(E->a2) != 0){ /* FIXME */ + printf("GROUMF\n"); + exit(-1); + } + mpres_neg(P->y, P->y, n); + } + } +#if USE_ADD_SUB_CHAINS > 0 + else if(E->type == ECM_EC_TYPE_HESSIAN) + hessian_negate(P, E, n); + else if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + twisted_hessian_negate(P, E, n); +#endif + } +#if DEBUG_ADD_LAWS >= 2 + printf("neg(P):="); ell_point_print(P, E, n); printf(";\n"); +#endif +} + +/* Q <- [e]*P + Return value: 0 if a factor is found, and the factor is in Q->x, + 1 otherwise. +*/ +int +ell_point_mul_plain (mpz_t f, ell_point_t Q, mpz_t e, ell_point_t P, ell_curve_t E, mpmod_t n) +{ + size_t l; + int negated = 0, status = 1; + ell_point_t P0; + + if(ell_point_is_zero(P, E, n) != 0){ + ell_point_set(Q, P, E, n); + return 1; + } + + if (mpz_sgn (e) == 0) + { + ell_point_set_to_zero(Q, E, n); + return 1; + } + + if (mpz_sgn (e) < 0) + { + negated = 1; + mpz_neg (e, e); + ell_point_negate(P, E, n); /* since the point is non-zero */ + } + + if (mpz_cmp_ui (e, 1) == 0){ + ell_point_set(Q, P, E, n); + goto ell_point_mul_plain_end; + } + + l = mpz_sizeinbase (e, 2) - 1; /* l >= 1 */ + + ell_point_init(P0, E, n); + ell_point_set(P0, P, E, n); + +#if DEBUG_ADD_LAWS >= 2 + printf("P:="); ell_point_print(P, E, n); printf(";\n"); +#endif + while (l-- > 0) + { +#if DEBUG_ADD_LAWS >= 2 + printf("P0:="); ell_point_print(P0, E, n); printf(";\n"); +#endif + if(ell_point_duplicate (f, P0, P0, E, n) == 0) + { + status = 0; + break; + } +#if DEBUG_ADD_LAWS >= 2 + printf("Rdup:="); ell_point_print(P0, E, n); printf(";\n"); + printf("dup:=ProjEcmDouble(P0, E, N); ProjEcmEqual(dup, Rdup, N);\n"); +#endif + if (mpz_tstbit (e, l)) + { + if(ell_point_add (f, P0, P0, P, E, n) == 0) + { + status = 0; + break; + } +#if DEBUG_ADD_LAWS >= 2 + printf("Radd:="); ell_point_print(P0, E, n); printf(";\n"); + 
printf("Padd:=ProjEcmAdd(P, Rdup, E, N); ProjEcmEqual(Padd, Radd, N);\n"); +#endif + } + } + + ell_point_set(Q, P0, E, n); + ell_point_clear(P0, E, n); +ell_point_mul_plain_end: + + /* Undo negation to avoid changing the caller's e value */ + if (negated){ + mpz_neg (e, e); + ell_point_negate(P, E, n); + } + return status; +} + +int +ell_point_mul(mpz_t f, ell_point_t Q, mpz_t e, ell_point_t P, ell_curve_t E, mpmod_t n) +{ +#if 1 /* keeping it simple */ + return ell_point_mul_plain(f, Q, e, P, E, n); +#else + return ell_point_mul_add_sub(f, Q, e, P, E, n); +#endif +} + diff -Nru gmp-ecm-7.0.4+ds/addlaws.h gmp-ecm-7.0.5+ds/addlaws.h --- gmp-ecm-7.0.4+ds/addlaws.h 2016-03-15 09:59:19.000000000 +0000 +++ gmp-ecm-7.0.5+ds/addlaws.h 2022-06-06 14:16:49.000000000 +0000 @@ -1,4 +1,5 @@ -#define USE_ADD_SUB_CHAINS 1 +#define DEBUG_ADD_LAWS 0 +#define USE_ADD_SUB_CHAINS 0 #define pt_is_equal(P, Q) (mpz_cmp((P)->x, (Q)->x) == 0 \ && mpz_cmp((P)->y, (Q)->y) == 0 \ @@ -9,20 +10,13 @@ void pt_set_to_zero(ell_point_t P, mpmod_t n); void pt_assign(ell_point_t Q, ell_point_t P, ATTRIBUTE_UNUSED mpmod_t n); void pt_neg(ell_point_t P, mpmod_t n); -void pt_many_set_to_zero(ell_point_t *tP, int nE, mpmod_t n); -void pt_many_neg(ell_point_t *tP, int nE, mpmod_t n); -void pt_many_assign(ell_point_t *tQ, ell_point_t *tP, int nE, mpmod_t n); -void pt_many_print(ell_curve_t *tE, ell_point_t *tP, int nE, mpmod_t n); -void print_mpz_from_mpres(mpres_t x, mpmod_t n); -int pt_many_duplicate(ell_point_t *tQ, ell_point_t *tP, ell_curve_t *tE, int nE, mpmod_t n, mpres_t *num, mpres_t *den, mpres_t *inv, char *ok); -int pt_many_mul(ell_point_t *tQ, ell_point_t *tP, ell_curve_t *tE, int nE, - mpz_t e, mpmod_t n, - mpres_t *num, mpres_t *den, mpres_t *inv, char *ok); int hessian_to_weierstrass(mpz_t f, mpres_t x, mpres_t y, mpres_t D, mpmod_t n); +int +twisted_hessian_to_weierstrass(mpz_t f, mpres_t x, mpres_t y, mpres_t c, mpres_t d, mpmod_t n); -int build_MO_chain(short *S, size_t Slen, mpz_t e, 
int w); -int build_add_sub_chain(short *S, size_t Slen, mpz_t e, int w); +size_t build_MO_chain(short *S, size_t Slen, mpz_t e, int w); +size_t build_add_sub_chain(short *S, size_t Slen, mpz_t e, int w); int compute_s_4_add_sub(mpz_t s, ecm_uint B1, int disc); int mult_by_3(mpz_t f, mpres_t x, mpres_t y, mpres_t A, mpmod_t n); @@ -39,16 +33,15 @@ int ell_point_is_on_curve(ell_point_t P, ell_curve_t E, mpmod_t n); int ell_point_is_zero(ell_point_t P, ell_curve_t E, mpmod_t n); void ell_point_set_to_zero(ell_point_t P, ell_curve_t E, mpmod_t n); -int ell_point_add(ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n); -int ell_point_sub(ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n); -int ell_point_duplicate(ell_point_t R, ell_point_t P, ell_curve_t E, mpmod_t n); +int ell_point_add(mpz_t f, ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n); +int ell_point_sub(mpz_t f, ell_point_t R, ell_point_t P, ell_point_t Q, ell_curve_t E, mpmod_t n); +int ell_point_duplicate(mpz_t f, ell_point_t R, ell_point_t P, ell_curve_t E, mpmod_t n); void ell_point_negate(ell_point_t P, ell_curve_t E, mpmod_t n); -int ell_point_mul_plain (ell_point_t Q, mpz_t e, ell_point_t P, ell_curve_t E, mpmod_t n); +int ell_point_mul_plain (mpz_t f, ell_point_t Q, mpz_t e, ell_point_t P, ell_curve_t E, mpmod_t n); int get_add_sub_w(mpz_t e); -void add_sub_pack(mpz_t s, int w, short *S, int iS); -void add_sub_unpack(int *w, short **S, int *iS, mpz_t s); -int ell_point_mul_add_sub_with_S(ell_point_t Q, ell_point_t P, ell_curve_t E, - mpmod_t n, int w, short *S, int iS); -int ell_point_mul_add_sub (ell_point_t Q, mpz_t e, ell_point_t P, +void add_sub_pack(mpz_t s, int w, short *S, size_t iS); +void add_sub_unpack(int *w, short **S, size_t *iS, mpz_t s); +int ell_point_mul_add_sub_with_S(mpz_t f, ell_point_t Q, ell_point_t P, ell_curve_t E,mpmod_t n, int w, short *S, int iS); +int ell_point_mul_add_sub (mpz_t f, ell_point_t Q, mpz_t e, 
ell_point_t P, ell_curve_t E, mpmod_t n); -int ell_point_mul(ell_point_t Q, mpz_t e, ell_point_t P, ell_curve_t E, mpmod_t n); +int ell_point_mul(mpz_t f, ell_point_t Q, mpz_t e, ell_point_t P, ell_curve_t E, mpmod_t n); diff -Nru gmp-ecm-7.0.4+ds/aprtcle/mpz_aprcl.c gmp-ecm-7.0.5+ds/aprtcle/mpz_aprcl.c --- gmp-ecm-7.0.4+ds/aprtcle/mpz_aprcl.c 2015-03-16 07:01:51.000000000 +0000 +++ gmp-ecm-7.0.5+ds/aprtcle/mpz_aprcl.c 2022-06-06 14:16:49.000000000 +0000 @@ -609,7 +609,7 @@ if (verbose >= APRTCLE_VERBOSE1) { - printf("P = %2d, Q = %12d (%3.2f%%)\r", P, Q, (i * (TestingQs + 1) + j) * 100.0 / (NP * (TestingQs + 1))); + printf("APR primality test: P = %2d, Q = %12d (%3.2f%%)\r", P, Q, (i * (TestingQs + 1) + j) * 100.0 / (NP * (TestingQs + 1))); fflush(stdout); } diff -Nru gmp-ecm-7.0.4+ds/athlon/autogen.py gmp-ecm-7.0.5+ds/athlon/autogen.py --- gmp-ecm-7.0.4+ds/athlon/autogen.py 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/autogen.py 2022-06-06 14:16:49.000000000 +0000 @@ -3,6 +3,13 @@ import re import sys +# Final assembler statement to mark stack as not executable on linux elf platforms +# Single quotes are used around # to prevent M4 to discard them as comments. M4 will remove them. 
+noexecstack_statement = """ +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif +""" def offaddr(addr, offset): if offset == 0: @@ -199,7 +206,7 @@ k = int(sys.argv[1]) if k == 1: - print """# + print("""# # mp_limb_t mulredc1(mp_limb_t *z, const mp_limb_t x, const mp_limb_t y, # const mp_limb_t m, mp_limb_t inv_m) # @@ -242,7 +249,7 @@ movl %edx, (%ecx) adcl $0, %eax ret -""" +""" + noexecstack_statement) else: - print mulredc_k_rolled(k) + print(mulredc_k_rolled(k) + noexecstack_statement) diff -Nru gmp-ecm-7.0.4+ds/athlon/generate_all gmp-ecm-7.0.5+ds/athlon/generate_all --- gmp-ecm-7.0.4+ds/athlon/generate_all 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/generate_all 2022-06-06 14:16:49.000000000 +0000 @@ -1,6 +1,6 @@ #!/bin/sh -for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do +for i in {1..20}; do ./autogen.py $i > mulredc$i.asm done diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc10.asm gmp-ecm-7.0.5+ds/athlon/mulredc10.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc10.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc10.asm 2022-06-06 14:16:49.000000000 +0000 @@ -254,3 +254,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc11.asm gmp-ecm-7.0.5+ds/athlon/mulredc11.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc11.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc11.asm 2022-06-06 14:16:49.000000000 +0000 @@ -272,3 +272,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc12.asm gmp-ecm-7.0.5+ds/athlon/mulredc12.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc12.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc12.asm 2022-06-06 14:16:49.000000000 +0000 @@ -290,3 +290,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) 
+.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc13.asm gmp-ecm-7.0.5+ds/athlon/mulredc13.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc13.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc13.asm 2022-06-06 14:16:49.000000000 +0000 @@ -308,3 +308,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc14.asm gmp-ecm-7.0.5+ds/athlon/mulredc14.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc14.asm 2006-03-07 15:57:35.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc14.asm 2022-06-06 14:16:49.000000000 +0000 @@ -326,3 +326,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc15.asm gmp-ecm-7.0.5+ds/athlon/mulredc15.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc15.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc15.asm 2022-06-06 14:16:49.000000000 +0000 @@ -344,3 +344,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc16.asm gmp-ecm-7.0.5+ds/athlon/mulredc16.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc16.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc16.asm 2022-06-06 14:16:49.000000000 +0000 @@ -362,3 +362,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc17.asm gmp-ecm-7.0.5+ds/athlon/mulredc17.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc17.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc17.asm 2022-06-06 14:16:49.000000000 +0000 @@ -380,3 +380,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc18.asm 
gmp-ecm-7.0.5+ds/athlon/mulredc18.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc18.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc18.asm 2022-06-06 14:16:49.000000000 +0000 @@ -398,3 +398,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc19.asm gmp-ecm-7.0.5+ds/athlon/mulredc19.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc19.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc19.asm 2022-06-06 14:16:49.000000000 +0000 @@ -416,3 +416,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc1.asm gmp-ecm-7.0.5+ds/athlon/mulredc1.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc1.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc1.asm 2022-06-06 14:16:49.000000000 +0000 @@ -42,3 +42,7 @@ adcl $0, %eax ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc20.asm gmp-ecm-7.0.5+ds/athlon/mulredc20.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc20.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc20.asm 2022-06-06 14:16:49.000000000 +0000 @@ -434,3 +434,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc2.asm gmp-ecm-7.0.5+ds/athlon/mulredc2.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc2.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc2.asm 2022-06-06 14:16:49.000000000 +0000 @@ -110,3 +110,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc3.asm gmp-ecm-7.0.5+ds/athlon/mulredc3.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc3.asm 2006-03-07 15:57:36.000000000 +0000 +++ 
gmp-ecm-7.0.5+ds/athlon/mulredc3.asm 2022-06-06 14:16:49.000000000 +0000 @@ -128,3 +128,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc4.asm gmp-ecm-7.0.5+ds/athlon/mulredc4.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc4.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc4.asm 2022-06-06 14:16:49.000000000 +0000 @@ -146,3 +146,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc5.asm gmp-ecm-7.0.5+ds/athlon/mulredc5.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc5.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc5.asm 2022-06-06 14:16:49.000000000 +0000 @@ -164,3 +164,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc6.asm gmp-ecm-7.0.5+ds/athlon/mulredc6.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc6.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc6.asm 2022-06-06 14:16:49.000000000 +0000 @@ -182,3 +182,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc7.asm gmp-ecm-7.0.5+ds/athlon/mulredc7.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc7.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc7.asm 2022-06-06 14:16:49.000000000 +0000 @@ -200,3 +200,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc8.asm gmp-ecm-7.0.5+ds/athlon/mulredc8.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc8.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc8.asm 2022-06-06 14:16:49.000000000 +0000 @@ -218,3 +218,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) 
+.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/athlon/mulredc9.asm gmp-ecm-7.0.5+ds/athlon/mulredc9.asm --- gmp-ecm-7.0.4+ds/athlon/mulredc9.asm 2006-03-07 15:57:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/athlon/mulredc9.asm 2022-06-06 14:16:49.000000000 +0000 @@ -236,3 +236,7 @@ popl %ebp ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/AUTHORS gmp-ecm-7.0.5+ds/AUTHORS --- gmp-ecm-7.0.4+ds/AUTHORS 2014-04-08 07:01:43.000000000 +0000 +++ gmp-ecm-7.0.5+ds/AUTHORS 2022-06-06 14:16:49.000000000 +0000 @@ -26,6 +26,8 @@ and the NTT code. Jason S. Papadopoulos contributed optimizations to the NTT code. +Seth Troisi + contributed to GPU code Paul Zimmermann author of the first version of the program. @@ -34,4 +36,4 @@ Japke Rosink, Bruce Dodson. If you want to contribute to GMP-ECM, you are welcome; the development -version is available on . +version is available on . diff -Nru gmp-ecm-7.0.4+ds/auxlib.c gmp-ecm-7.0.5+ds/auxlib.c --- gmp-ecm-7.0.4+ds/auxlib.c 2015-02-25 14:50:40.000000000 +0000 +++ gmp-ecm-7.0.5+ds/auxlib.c 2022-06-06 14:16:49.000000000 +0000 @@ -283,3 +283,35 @@ return fseek (f, (long) offset, whence); } #endif + +int +ecm_tstbit (mpz_srcptr u, ecm_uint bit_index) +{ + mp_srcptr u_ptr = PTR(u); + ecm_int size = SIZ(u); + ecm_uint abs_size = ABS(size); + ecm_uint limb_index = bit_index / GMP_NUMB_BITS; + mp_srcptr p = u_ptr + limb_index; + mp_limb_t limb; + + if (limb_index >= abs_size) + return (size < 0); + + limb = *p; + if (size < 0) + { + limb = -limb; /* twos complement */ + + while (p != u_ptr) + { + p--; + if (*p != 0) + { + limb--; /* make it a ones complement instead */ + break; + } + } + } + + return (limb >> (bit_index % GMP_NUMB_BITS)) & 1; +} diff -Nru gmp-ecm-7.0.4+ds/batch.c gmp-ecm-7.0.5+ds/batch.c --- gmp-ecm-7.0.4+ds/batch.c 2016-04-08 12:47:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/batch.c 2022-06-06 14:16:49.000000000 +0000 @@ 
-41,14 +41,14 @@ is = (2^32-1). Multiplying all primes up to the following will result in a product that has (2^32-1) bits. */ #define MAX_B1_BATCH 2977044736UL -#elif defined(_WIN32) +#elif defined(_WIN32) && __GNU_MP_VERSION <= 6 && !defined(__MPIR_VERSION) /* Due to a limitation in GMP on 64-bit Windows, should also affect 32-bit Windows, sufficient memory cannot be allocated for the batch product s when using primes larger than the following */ #define MAX_B1_BATCH 3124253146UL #else -/* nth_prime(2^(MAX_HEIGHT-1)) */ -#define MAX_B1_BATCH 50685770167ULL +/* nth_prime(2^(MAX_HEIGHT-1))-1 */ +#define MAX_B1_BATCH 50685770166ULL #endif /* If forbiddenres != NULL, forbiddenres = "m r_1 ... r_k -1" indicating that @@ -67,7 +67,7 @@ prime_info_init (prime_info); - ASSERT_ALWAYS (B1 < MAX_B1_BATCH); + ASSERT_ALWAYS (B1 <= MAX_B1_BATCH); for (j = 0; j < MAX_HEIGHT; j++) mpz_init (acc[j]); /* sets acc[j] to 0 */ @@ -79,40 +79,44 @@ pp = qi = pi; maxpp = B1 / qi; #ifdef HAVE_ADDLAWS - if(forbiddenres != NULL && pi > 2){ - /* non splitting primes can occur in even powers only */ - int rp = (int)(pi % forbiddenres[0]); - for(j = 1; forbiddenres[j] >= 0; j++) - if(rp >= forbiddenres[j]) - break; - if(rp == forbiddenres[j]){ - /* printf("p=%lu is forbidden\n", pi);*/ - if(qi <= maxpp){ - /* qi <= B1/qi => qi^2 <= B1, let it go */ - qi *= qi; - } - else{ - /* qi is too large, do not increment i */ - pi = getprime_mt (prime_info); - continue; - } - } - } + if (forbiddenres != NULL && pi > 2) + { + /* non splitting primes can occur in even powers only */ + int rp = (int)(pi % forbiddenres[0]); + for (j = 1; forbiddenres[j] >= 0; j++) + if (rp >= forbiddenres[j]) + break; + if (rp == forbiddenres[j]) + { + /* printf("p=%lu is forbidden\n", pi); */ + if (qi <= maxpp) + { + /* qi <= B1/qi => qi^2 <= B1, let it go */ + qi *= qi; + } + else + { + /* qi is too large, do not increment i */ + pi = getprime_mt (prime_info); + continue; + } + } + } #endif while (pp <= maxpp) pp *= qi; 
#if ECM_UINT_MAX == 4294967295 - mpz_set_ui (ppz, pp); + mpz_set_ui (ppz, pp); #else - mpz_set_uint64 (ppz, pp); + mpz_set_uint64 (ppz, pp); #endif if ((i & 1) == 0) - mpz_set (acc[0], ppz); + mpz_set (acc[0], ppz); else - mpz_mul (acc[0], acc[0], ppz); - + mpz_mul (acc[0], acc[0], ppz); + j = 0; /* We have accumulated i+1 products so far. If bits 0..j of i are all set, then i+1 is a multiple of 2^(j+1). */ @@ -140,7 +144,7 @@ prime_info_clear (prime_info); /* free the prime tables */ for (i = 0; i < MAX_HEIGHT; i++) - mpz_clear (acc[i]); + mpz_clear (acc[i]); mpz_clear (ppz); } @@ -264,13 +268,13 @@ A is curve parameter in Montgomery's form: g*y^2*z = x^3 + a*x^2*z + x*z^2 n is the number to factor - B1 is the stage 1 bound + B1 is the stage 1 bound Output: If a factor is found, it is returned in x. Otherwise, x contains the x-coordinate of the point computed in stage 1 (with z coordinate normalized to 1). - B1done is set to B1 if stage 1 completed normally, - or to the largest prime processed if interrupted, but never - to a smaller value than B1done was upon function entry. + B1done is set to B1 if stage 1 completed normally, + or to the largest prime processed if interrupted, but never + to a smaller value than B1done was upon function entry. 
Return value: ECM_FACTOR_FOUND_STEP1 if a factor, otherwise ECM_NO_FACTOR_FOUND */ @@ -304,7 +308,7 @@ /* Compute d=(A+2)/4 from A and d'=B*d thus d' = 2^(GMP_NUMB_BITS-2)*(A+2) */ if (batch == ECM_PARAM_BATCH_SQUARE || batch == ECM_PARAM_BATCH_32BITS_D) - { + { mpres_get_z (u, A, n); mpz_add_ui (u, u, 2); mpz_mul_2exp (u, u, GMP_NUMB_BITS - 2); @@ -358,7 +362,7 @@ { for (i = mpz_sizeinbase (s, 2) - 1; i-- > 0;) { - if (mpz_tstbit (s, i) == 0) /* (j,j+1) -> (2j,2j+1) */ + if (ecm_tstbit (s, i) == 0) /* (j,j+1) -> (2j,2j+1) */ /* P2 <- P1+P2 P1 <- 2*P1 */ dup_add_batch1 (x1, z1, x2, z2, t, u, d_1, n); else /* (j,j+1) -> (2j+1,2j+2) */ @@ -371,7 +375,7 @@ mpresn_pad (d_2, n); for (i = mpz_sizeinbase (s, 2) - 1; i-- > 0;) { - if (mpz_tstbit (s, i) == 0) /* (j,j+1) -> (2j,2j+1) */ + if (ecm_tstbit (s, i) == 0) /* (j,j+1) -> (2j,2j+1) */ /* P2 <- P1+P2 P1 <- 2*P1 */ dup_add_batch2 (x1, z1, x2, z2, t, u, d_2, n); else /* (j,j+1) -> (2j+1,2j+2) */ @@ -398,10 +402,8 @@ mpz_clear (z2); mpz_clear (t); mpz_clear (u); - if (batch == 2) - { + if (batch == ECM_PARAM_BATCH_2) mpz_clear (d_2); - } return ret; } diff -Nru gmp-ecm-7.0.4+ds/build.vc12/config.h gmp-ecm-7.0.5+ds/build.vc12/config.h --- gmp-ecm-7.0.4+ds/build.vc12/config.h 2014-04-08 07:01:43.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc12/config.h 2022-06-06 14:16:49.000000000 +0000 @@ -4,7 +4,7 @@ #define VERSION_GPU "gpu_ecm-win" -#define PACKAGE_BUGREPORT "ecm-discuss@lists.gforge.inria.fr" +#define PACKAGE_BUGREPORT "ecm-discuss@inria.fr" /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP systems. This function is required for `alloca.c' support on those systems. 
diff -Nru gmp-ecm-7.0.4+ds/build.vc12/gen_ecm_h.bat gmp-ecm-7.0.5+ds/build.vc12/gen_ecm_h.bat --- gmp-ecm-7.0.4+ds/build.vc12/gen_ecm_h.bat 2016-10-11 09:22:42.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc12/gen_ecm_h.bat 2022-06-06 14:16:49.000000000 +0000 @@ -5,7 +5,7 @@ for /f "tokens=1,2*" %%a in (..\ecm.h.in) do ( if "%%a" EQU "#undef" ( if "%%b" EQU "ECM_VERSION" ( - echo #define ECM_VERSION "7.0.4">>tmp.h + echo #define ECM_VERSION "7.0.5">>tmp.h ) ) else echo %%a %%b %%c>>tmp.h ) diff -Nru gmp-ecm-7.0.4+ds/build.vc14/assembler/a_win32a_redc.asm gmp-ecm-7.0.5+ds/build.vc14/assembler/a_win32a_redc.asm --- gmp-ecm-7.0.4+ds/build.vc14/assembler/a_win32a_redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/a_win32a_redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,133 @@ +; +; Part of GMP-ECM +; +; void ecm_redc3( +; mp_limb_t *z, rdi r8 <- rcx +; const mp_limb_t *x, rsi r9 <- rdx +; size_t n, rdx r10 <- r8 +; mp_limb_t m rcx r11 <- r9 +; ) + +%macro seq 3 + mov eax, [byte esi+4*%3] + mul ebp + add [byte edi+4*%3], %2 + adc %1, eax + mov %2, edx + adc %2, 0 +%endmacro + + text + global _ecm_redc3 + +_ecm_redc3: + push ebp + push edi + push esi + push ebx + sub esp, 16 + mov ecx, [esp+44] + mov edi, [esp+36] + mov [esp], ecx + cmp ecx, 5 + jae .3 + +.1: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + xor ebx, ebx + +.2: mov eax, [esi] + add edi, 4 + mul ebp + add esi, 4 + add eax, ebx + adc edx, 0 + add [edi-4], eax + adc edx, 0 + dec ecx + mov ebx, edx + jnz .2 + mov edi, [esp+36] + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .1 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + +.3: mov edx, ecx + dec ecx + sub edx, 2 + neg ecx + shr edx, 4 + and ecx, 15 + mov [esp+8], edx + mov edx, ecx + shl edx, 4 + neg ecx + lea edx, [edx+ecx+.6] + mov [esp+44], ecx + mov [esp+12], edx + +.4: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov 
[esp+36], edi + mov ecx, [esp+44] + mov edx, [esp+8] + mov [esp+4], edx + mov eax, [esi] + lea esi, [esi+ecx*4+4] + mul ebp + lea edi, [edi+ecx*4] + mov ebx, edx + mov edx, [esp+12] + test ecx, 1 + mov ecx, eax + cmovnz ecx, ebx + cmovnz ebx, eax + jmp edx + + align 32 +.5: add edi, 64 +.6: + +%assign i 0 +%rep 16 + %if (i & 1) + seq ecx, ebx, i + %else + seq ebx, ecx, i + %endif + %assign i i + 1 +%endrep + + dec dword [esp+4] + lea esi, [esi+64] + jns .5 + + add [edi+64], ecx + mov edi, [esp+36] + adc ebx, 0 + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .4 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + + end + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/assembler/a_win32p_mulredc.asm gmp-ecm-7.0.5+ds/build.vc14/assembler/a_win32p_mulredc.asm --- gmp-ecm-7.0.4+ds/build.vc14/assembler/a_win32p_mulredc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/a_win32p_mulredc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,148 @@ + +; Part of GMP-ECM +; +; mp_limb_t mulredc1( 1 limb +; mp_limb_t *z, +; const mp_limb_t x, +; const mp_limb_t y, +; const mp_limb_t m, +; mp_limb_t inv_m +; ) +; +; mp_limb_t mulredc( > 1 limb +; mp_limb_t *z, +; const mp_limb_t *x, +; const mp_limb_t *y, +; const mp_limb_t *m, +; mp_limb_t inv_m +; ) + +%macro mseq 1 + movd mm1, [esi+4*%1] + movd mm2, [edi+4*%1] + pmuludq mm1, mm7 + paddq mm2, mm1 + paddq mm0, mm2 + movd [edi+4*%1], mm0 + psrlq mm0, 32 +%endmacro + +%macro mulredc 1 +%assign limbs %1 +%define f_name(x) _mulredc %+ x + + global f_name(limbs) +%ifdef DLL + export f_name(limbs) +%endif + +f_name(limbs): + push ebp + push edi + push esi + push ebx + sub esp, 8*(limbs+1) + mov edi, esp + +%assign i 0 +%rep 2 * limbs + 1 + mov dword [edi+4*i], 0 + %assign i i + 1 +%endrep + + mov dword [esp+8*limbs+4], limbs + + align 32 + +.1: mov eax, [esp+8*limbs+32] + mov esi, [esp+8*limbs+36] + mov eax, [eax] + mul dword [esi] + add eax, [edi] + mul dword 
[esp+8*limbs+44] + mov ebp, eax + mov esi, [esp+8*limbs+40] + + pxor mm0, mm0 + movd mm7, ebp + +%assign i 0 +%rep limbs + mseq i + %assign i i + 1 +%endrep + + movd ecx, mm0 + + add [edi+4*limbs], ecx + adc dword [edi+4*limbs+4], 0 + mov eax, [esp+8*limbs+32] + mov ebp, [eax] + mov esi, [esp+8*limbs+36] + + pxor mm0, mm0 + movd mm7, ebp + +%assign i 0 +%rep limbs + mseq i + %assign i i + 1 +%endrep + + movd ecx, mm0 + add [edi+4*limbs], ecx + adc dword [edi+4*limbs+4], 0 + add dword [esp+8*limbs+32], 4 + add edi, 4 + dec dword [esp+8*limbs+4] + jnz .1 + + mov ebx, [esp+8*limbs+28] + +%assign i 0 +%rep limbs + mov eax, [edi+4*i] + mov [ebx+4*i], eax + %assign i i + 1 +%endrep + mov eax, [edi+4*limbs] + add esp, 8*(limbs+1) + + pop ebx + pop esi + pop edi + pop ebp + emms + ret +%endmacro + + bits 32 + section .text + + global _mulredc1 +%ifdef DLL + export _mulredc1 +%endif + +_mulredc1: + mov eax, [esp+12] + mul dword [esp+8] + mov [esp+12], edx + mov [esp+8], eax + mul dword [esp+20] + mul dword [esp+16] + add eax, [esp+8] + adc edx, [esp+12] + mov ecx, [esp+4] + mov [ecx], edx + adc eax, 0 + ret + +%assign i 2 +%rep 19 ; 2..20 inclusive + mulredc i + %assign i i + 1 +%endrep + + end + diff -Nru gmp-ecm-7.0.4+ds/build.vc14/assembler/a_win32p_redc.asm gmp-ecm-7.0.5+ds/build.vc14/assembler/a_win32p_redc.asm --- gmp-ecm-7.0.4+ds/build.vc14/assembler/a_win32p_redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/a_win32p_redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,145 @@ +; +; Part of GMP-ECM +; +; void ecm_redc3( +; mp_limb_t *z, rdi r8 <- rcx +; const mp_limb_t *x, rsi r9 <- rdx +; size_t n, rdx r10 <- r8 +; mp_limb_t m rcx r11 <- r9 +; ) + +%macro rloop 3 + mov eax, [byte esi+4*%3] + mul ebp + add [byte edi+4*%3], %2 + adc %1, eax + mov %2, edx + adc %2, 0 +%endmacro + + bits 32 + section .text + + global _ecm_redc3 +%ifdef DLL + export _ecm_redc3 +%endif + +_ecm_redc3: + push ebp + push edi + push esi + push ebx + sub 
esp, 16 + + mov ecx, [esp+44] + mov edi, [esp+36] + mov [esp], ecx + cmp ecx, 5 + jae .unroll + +.1: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + xor ebx, ebx + +.2: mov eax, [esi] + add edi, 4 + mul ebp + add esi, 4 + add eax, ebx + adc edx, 0 + add [edi-4], eax + adc edx, 0 + dec ecx + mov ebx, edx + jnz .2 + mov edi, [esp+36] + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .1 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + +.unroll: + mov edx, ecx + dec ecx + sub edx, 2 + neg ecx + shr edx, 4 + and ecx, 15 + mov [esp+8], edx + mov edx, ecx + shl edx, 4 + neg ecx + lea edx, [edx+ecx*1+.loop_base] + mov [esp+44], ecx + mov [esp+12], edx + +.4: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + mov edx, [esp+8] + mov [esp+4], edx + mov eax, [esi] + lea esi, [esi+ecx*4+4] + mul ebp + lea edi, [edi+ecx*4] + mov ebx, edx + mov edx, [esp+12] + test ecx, 1 + mov ecx, eax + cmovnz ecx, ebx + cmovnz ebx, eax + jmp edx + + align 32 +.5: add edi, 64 +.loop_base: + rloop ebx, ecx, 0 + rloop ecx, ebx, 1 + rloop ebx, ecx, 2 + rloop ecx, ebx, 3 + rloop ebx, ecx, 4 + rloop ecx, ebx, 5 + rloop ebx, ecx, 6 + rloop ecx, ebx, 7 + rloop ebx, ecx, 8 + rloop ecx, ebx, 9 + rloop ebx, ecx, 10 + rloop ecx, ebx, 11 + rloop ebx, ecx, 12 + rloop ecx, ebx, 13 + rloop ebx, ecx, 14 + rloop ecx, ebx, 15 + + dec dword [esp+4] + lea esi, [esi+64] + jns .5 + + add [edi+64], ecx + mov edi, [esp+36] + adc ebx, 0 + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .4 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + + end diff -Nru gmp-ecm-7.0.4+ds/build.vc14/assembler/a_x64_mulredc.asm gmp-ecm-7.0.5+ds/build.vc14/assembler/a_x64_mulredc.asm --- gmp-ecm-7.0.4+ds/build.vc14/assembler/a_x64_mulredc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/a_x64_mulredc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,237 @@ +; +; 
Part of GMP-ECM +; +; mp_limb_t mulredc1( MSVC 1 limb +; mp_limb_t *z, rcx +; const mp_limb_t x, rdx +; const mp_limb_t y, r8 +; const mp_limb_t m, r9 +; mp_limb_t inv_m [rsp+0x28] +; ) +; +; mp_limb_t mulredc( MSVC > 1 limb +; mp_limb_t *z, rcx +; const mp_limb_t *x, rdx +; const mp_limb_t *y, r8 +; const mp_limb_t *m, r9 +; mp_limb_t inv_m [rsp+0x28] +; ) + +%macro mseq_1 4 + mov %2, rcx + mul r14 + add %1, rax + mov rax, [r9+8*%3] + adc %2, rdx + mul r11 +%if %3 < %4 - 1 + add rax, %1 + mov [rbp+8*(%3-1)], rax + mov rax, [r8+8*(%3+1)] + adc %2, rdx + setc cl +%else + add %1, rax + mov [rbp+8*(%3-1)], %1 + adc %2, rdx + mov [rbp+8*%3], %2 + setc cl + mov [rbp+8*(%3+1)], rcx +%endif +%endmacro + +%macro mseq_20 2 + mov r14, [r13+r12*8] + mov rax, [r8] + mov %1, [rbp] + mov %2, [rbp+8] + mul r14 + add r12, 1 + add rax, %1 + adc %2, rdx + setc cl + mov %1, rax + imul rax, r10 + mov r11, rax + mul qword [r9] + add %1, rax + adc %2, rdx + mov rax, [r8+8] +%endmacro + +%macro mseq_2 4 + mov %2, [rbp+8*(%3+1)] + adc %2, rcx +%if %3 < %4 - 1 + setc cl +%endif + mul r14 + add %1, rax + mov rax, [r9+8*%3] + adc %2, rdx +%if %3 < %4 - 1 + adc cl, 0 +%else + setc cl +%endif + mul r11 +%if %3 < %4 - 1 + add rax, %1 + mov [rbp+8*(%3-1)], rax + adc %2, rdx + mov rax, [r8+8*(%3+1)] +%else + add %1, rax + mov [rbp+8*(%3-1)], %1 + adc %2, rdx + mov [rbp+8*%3],%2 + adc cl, 0 + mov [rbp+8*(%3+1)], rcx +%endif +%endmacro + +%macro store 1 +%assign i 0 +%rep %1 + %if i == %1 - 1 && (%1 & 1) + mov rax, [rbp+8*i] + mov [rdi+8*i], rax + %elif (i & 1) + mov [rdi+8*(i-1)], rax + mov [rdi+8*i], rdx + %else + mov rax, [rbp+8*i] + mov rdx, [rbp+8*(i+1)] + %endif + %assign i i + 1 +%endrep +%endmacro + +%macro mulredc 1 + +%assign limbs %1 +%define f_name(x) mulredc %+ x +%define stack_space 8 * (limbs + 1 + (limbs & 1)) + + global f_name(limbs) +%ifdef DLL + export f_name(limbs) +%endif + + align 64 + +PROC_FRAME f_name(limbs) ; SEH Frame + push_reg rbp + push_reg rbx + push_reg rsi + 
push_reg rdi + push_reg r12 + push_reg r13 + push_reg r14 + alloc_stack stack_space +END_PROLOGUE + ; *y in r8 + mov rdi, rcx ; *z -> rdi + mov r13, rdx ; *x -> r13 + mov r10, [rsp+8*12+stack_space] ; invm -> r10 + ; *m in r9 + mov r14, [r13] + mov rax, [r8] + xor rcx, rcx + lea rbp, [rsp] + mov r12, rcx + mul qword r14 + add r12, 1 + mov rsi, rax + mov rbx, rdx + imul rax, r10 + mov r11, rax + mul qword [r9] + add rsi, rax + mov rax, [r8+8] + adc rbx, rdx + setc cl + +%assign j 1 +%rep limbs - 1 +%if (j & 1) + mseq_1 rbx, rsi, j, limbs +%else + mseq_1 rsi, rbx, j, limbs +%endif + %assign j j + 1 +%endrep + + align 32 +.1: + +%assign j 1 +%if (limbs & 1) + mseq_20 rsi, rbx + %rep limbs - 1 + %if (j & 1) + mseq_2 rbx, rsi, j, limbs + %else + mseq_2 rsi, rbx, j, limbs + %endif + %assign j j + 1 + %endrep +%else + mseq_20 rbx, rsi + %rep limbs - 1 + %if (j & 1) + mseq_2 rsi, rbx, j, limbs + %else + mseq_2 rbx, rsi, j, limbs + %endif + %assign j j + 1 + %endrep +%endif + + cmp r12, limbs + jb .1 + + store limbs + + mov rax, rcx + add rsp, stack_space + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbx + pop rbp + ret +ENDPROC_FRAME +%endmacro + + bits 64 + section .text + + global mulredc1 +%ifdef DLL + export mulredc1 +%endif + + align 64 +mulredc1: + mov rax, r8 + mul rdx + mov r10, rax + mov r11, rdx + mul qword [rsp+0x28] + mul r9 + add rax, r10 + adc rdx, r11 + mov [rcx], rdx + adc rax, 0 + ret + +%assign i 2 +%rep 19 ; 2..20 inclusive + mulredc i + %assign i i + 1 +%endrep + + end diff -Nru gmp-ecm-7.0.4+ds/build.vc14/assembler/a_x64_redc.asm gmp-ecm-7.0.5+ds/build.vc14/assembler/a_x64_redc.asm --- gmp-ecm-7.0.4+ds/build.vc14/assembler/a_x64_redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/a_x64_redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,161 @@ +; +; Part of GMP-ECM +; +; void ecm_redc3( +; mp_limb_t *z, rdi r8 <- rcx +; const mp_limb_t *x, rsi r9 <- rdx +; size_t n, rdx r10 <- r8 +; mp_limb_t m rcx r11 
<- r9 +; ) + +%macro rloop 3 + mov rax,[byte rsi+8*%3] + mul rbp + add [byte rdi+8*%3], %1 + adc %2, rax + mov %1, rdx + adc %1, 0 +%endmacro + + bits 64 + section .text + + global ecm_redc3 +%ifdef DLL + export ecm_redc3 +%endif + +PROC_FRAME ecm_redc3 + push_reg rbp + push_reg rbx + push_reg rsi + push_reg rdi + alloc_stack 5*8 +END_PROLOGUE + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + + mov r8, rdi + mov r9, rsi + mov r10, rdx + mov r11, rcx + + mov rcx, r10 + mov [rsp], rcx + cmp rcx, 3 + jae .unroll + +.1: mov rbp, r11 + mov rsi, r9 + imul rbp, [rdi] + mov r8, rdi + mov rcx, r10 + xor rbx, rbx + +.2: mov rax, [rsi] + add rdi, 8 + mul rbp + add rsi, 8 + add rax, rbx + adc rdx, 0 + add [rdi-8], rax + adc rdx, 0 + dec rcx + mov rbx, rdx + jnz .2 + mov rdi, r8 + mov [rdi], rbx + dec qword [rsp] + lea rdi, [rdi+8] + jnz .1 + + add rsp, 5*8 + pop rdi + pop rsi + pop rbx + pop rbp + ret + +.unroll: + mov rdx, rcx + dec rcx + sub rdx, 2 + neg rcx + shr rdx, 4 + and rcx, 15 + mov [rsp+16], rdx + mov rdx, rcx + shl rdx, 4 + lea r10, [.loop_base wrt rip] + add rdx, r10 + lea rdx, [rdx+rcx*4] + add rdx, rcx + neg rcx + mov r10, rcx + mov [rsp+24], rdx + +.4: mov rbp, r11 + mov rsi, r9 + imul rbp, [rdi] + mov r8, rdi + mov rcx, r10 + mov rdx, [rsp+16] + mov [rsp+8], rdx + + mov rax, [rsi] + lea rsi, [rsi+rcx*8+8] + mul rbp + lea rdi, [rdi+rcx*8] + mov rbx, rdx + + mov rdx, [rsp+24] + test rcx, 1 + mov rcx, rax + cmovnz rcx, rbx + cmovnz rbx, rax + jmp rdx + + align 64 + +.5: add rdi, 128 +.loop_base: + rloop rcx, rbx, 0 + rloop rbx, rcx, 1 + rloop rcx, rbx, 2 + rloop rbx, rcx, 3 + rloop rcx, rbx, 4 + rloop rbx, rcx, 5 + rloop rcx, rbx, 6 + rloop rbx, rcx, 7 + rloop rcx, rbx, 8 + rloop rbx, rcx, 9 + rloop rcx, rbx, 10 + rloop rbx, rcx, 11 + rloop rcx, rbx, 12 + rloop rbx, rcx, 13 + rloop rcx, rbx, 14 + rloop rbx, rcx, 15 + + dec qword [rsp+8] + lea rsi, [rsi+128] + jns .5 + + add [rdi+128], rcx + mov rdi, r8 + adc rbx, 0 + mov [rdi], rbx + dec qword [rsp] + 
lea rdi, [rdi+8] + jnz .4 + + add rsp, 5*8 + pop rdi + pop rsi + pop rbx + pop rbp + ret +ENDPROC_FRAME + + end diff -Nru gmp-ecm-7.0.4+ds/build.vc14/assembler/Makefile.am gmp-ecm-7.0.5+ds/build.vc14/assembler/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc14/assembler/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ +EXTRA_DIST = a_win32a_mulredc.asm a_win32a_redc.asm a_win32p_mulredc.asm \ + a_win32p_redc.asm a_x64_mulredc.asm a_x64_redc.asm \ + test_mulredc.c mulredc.h mulredc.asm redc.asm diff -Nru gmp-ecm-7.0.4+ds/build.vc14/assembler/mulredc.asm gmp-ecm-7.0.5+ds/build.vc14/assembler/mulredc.asm --- gmp-ecm-7.0.4+ds/build.vc14/assembler/mulredc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/mulredc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,8 @@ + +%ifdef _WIN64 +%include "a_x64_mulredc.asm" +%elifdef AMD_ASM +%include "a_win32a_mulredc.asm" +%else +%include "a_win32p_mulredc.asm" +%endif diff -Nru gmp-ecm-7.0.4+ds/build.vc14/assembler/mulredc.h gmp-ecm-7.0.5+ds/build.vc14/assembler/mulredc.h --- gmp-ecm-7.0.4+ds/build.vc14/assembler/mulredc.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/mulredc.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,32 @@ +#ifndef __ASM_REDC_H__ +#define __ASM_REDC_H__ + +#include <gmp.h> + +extern void ecm_redc3(mp_limb_t *cp, const mp_limb_t *np, mp_size_t nn, mp_limb_t Nprim); + + +/* WARNING: the size-1 version doesn't take pointers in input */ +extern mp_limb_t mulredc1(mp_limb_t *z, mp_limb_t x, mp_limb_t y, mp_limb_t m, mp_limb_t inv_m); + +extern mp_limb_t mulredc2(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc3(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc4(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const 
mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc5(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc6(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc7(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc8(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc9(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc10(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc11(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc12(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc13(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc14(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc15(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc16(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc17(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc18(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc19(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc20(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); + +#endif diff -Nru 
gmp-ecm-7.0.4+ds/build.vc14/assembler/redc.asm gmp-ecm-7.0.5+ds/build.vc14/assembler/redc.asm --- gmp-ecm-7.0.4+ds/build.vc14/assembler/redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,7 @@ +%ifdef _WIN64 +%include "a_x64_redc.asm" +%elifdef AMD_ASM +%include "a_win32a_redc.asm" +%else +%include "a_win32p_redc.asm" +%endif diff -Nru gmp-ecm-7.0.4+ds/build.vc14/assembler/test_mulredc.c gmp-ecm-7.0.5+ds/build.vc14/assembler/test_mulredc.c --- gmp-ecm-7.0.4+ds/build.vc14/assembler/test_mulredc.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/assembler/test_mulredc.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,303 @@ +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +#include <gmp.h> + +#include "asmredc.h" + +void mp_print(mp_limb_t *x, int N) { + int i; + for (i = 0; i < N-1; ++i) + printf("%lu + W*(", x[i]); + printf("%lu", x[N-1]); + for (i = 0; i < N-1; ++i) + printf(")"); + printf("\n"); +} + +static mp_limb_t +call_mulredc (int N, mp_limb_t *z, mp_limb_t *x, mp_limb_t *y, mp_limb_t *m, + mp_limb_t invm) +{ + mp_limb_t cy; + + switch (N) + { + case 1: + cy = mulredc1(z, x[0], y[0], m[0], invm); + break; + case 2: + cy = mulredc2(z, x, y, m, invm); + break; + case 3: + cy = mulredc3(z, x, y, m, invm); + break; + case 4: + cy = mulredc4(z, x, y, m, invm); + break; + case 5: + cy = mulredc5(z, x, y, m, invm); + break; + case 6: + cy = mulredc6(z, x, y, m, invm); + break; + case 7: + cy = mulredc7(z, x, y, m, invm); + break; + case 8: + cy = mulredc8(z, x, y, m, invm); + break; + case 9: + cy = mulredc9(z, x, y, m, invm); + break; + case 10: + cy = mulredc10(z, x, y, m, invm); + break; + case 11: + cy = mulredc11(z, x, y, m, invm); + break; + case 12: + cy = mulredc12(z, x, y, m, invm); + break; + case 13: + cy = mulredc13(z, x, y, m, invm); + break; + case 14: + cy = mulredc14(z, x, y, m, invm); + break; + case 15: + cy = mulredc15(z, x, y, m, invm); + break; + case 16: + cy = 
mulredc16(z, x, y, m, invm); + break; + case 17: + cy = mulredc17(z, x, y, m, invm); + break; + case 18: + cy = mulredc18(z, x, y, m, invm); + break; + case 19: + cy = mulredc19(z, x, y, m, invm); + break; + case 20: + cy = mulredc20(z, x, y, m, invm); + break; + default: + cy = mulredc20(z, x, y, m, invm); + } + return cy; +} + +void test(mp_size_t N, int k) +{ + mp_limb_t *x, *y, *yp, *z, *m, invm, cy, cy2, *tmp, *tmp2, *tmp3; + int i, j; + + x = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); + y = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); + z = (mp_limb_t *) malloc((N+1)*sizeof(mp_limb_t)); + m = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); + tmp = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); + tmp2 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); + tmp3 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); + + if (x == NULL || y == NULL || z == NULL || m == NULL || tmp == NULL || + tmp2 == NULL || tmp3 == NULL) + { + fprintf (stderr, "Cannot allocate memory in test_mulredc\n"); + exit (1); + } + + mpn_random2(m, N); + m[0] |= 1UL; + if (m[N-1] == 0) + m[N-1] = 1UL; + + invm = 1UL; + for (i = 0; i < 10; ++i) + invm = (2*invm-m[0]*invm*invm); + invm = -invm; + + assert( (invm*m[0] +1UL) == 0UL); + + yp = y; + for (i=0; i < k; ++i) { + /* Try a few special cases */ + if (i == 0) + { + /* Try all 0, product should be 0 */ + for (j = 0; j < N; j++) + x[j] = y[j] = 0; + } + else if (i == 1) + { + /* Try all 1 */ + for (j = 0; j < N; j++) + x[j] = y[j] = 1; + } + else if (i == 2) + { + /* Try all 2^wordsize - 1 */ + for (j = 0; j < N; j++) + x[j] = y[j] = ~(0UL); + } + else + { + /* In the other cases, try random data */ + if (i % 2 == 0) + { + /* Try squaring */ + mpn_random2(x, N); + yp = x; + } + else + { + /* Try multiplication */ + mpn_random2(x, N); + mpn_random2(y, N); + } + } + + // Mul followed by ecm_redc3 + mpn_mul_n(tmp, x, yp, N); + ecm_redc3(tmp, m, N, invm); + cy2 = mpn_add_n (tmp2, tmp + N, tmp, N); + + // Mixed mul and redc + cy = call_mulredc (N, z, x, 
yp, m, invm); + + if (cy != cy2) + printf ("i = %d: mulredc cy = %ld, mpn_mul_n/ecm_redc3 cy = %ld\n", + i, (long) cy, (long) cy2); + assert (cy == cy2); + if (mpn_cmp(z,tmp2, N) != 0) + { + printf ("i = %d\nmulredc = ", i); + for (j = N - 1; j >= 0; j--) + printf ("%lx ", z[j]); + printf ("\nmpn_mul_n/ecm_redc3 = "); + for (j = N - 1; j >= 0; j--) + printf ("%lx ", tmp2[j]); + printf ("\n"); + assert (mpn_cmp(z,tmp2, N) == 0); + } + + if (cy) + printf("!"); + z[N] = cy; + // Check with pure gmp : multiply by 2^(N*GMP_NUMB_BITS) and compare. + for (j=0; j < N; ++j) { + tmp[j] = 0; + tmp[j+N] = z[j]; + } + tmp[2*N] = z[N]; + mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N+1, m, N); + for (j=0; j < N; ++j) + z[j] = tmp3[j]; + + mpn_mul_n(tmp, x, yp, N); + mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N, m, N); + + assert(mpn_cmp(z, tmp3, N) == 0); + } + + free(tmp); free(tmp2); free(tmp3); + free(x); free(y); free(z); free(m); +} + + + +int main(int argc, char** argv) +{ + int i, len; + + if (argc > 1) /* Test a specific length */ + { + len = atoi (argv[1]); + for (i = 0; i < 1; i++) + test (len, 1000000); + return 0; + } + + for (;;) { + for (i = 1; i <= 20; ++i) { + test(i, 1000); + } +#if 0 + test(1, 1000); + test(2, 1000); + test(3, 1000); + test(4, 1000); + test(5, 1000); + test(6, 1000); + test(7, 1000); + test(8, 1000); + test(9, 1000); + test(10, 1000); + test(11, 1000); + test(12, 1000); + test(13, 100); + test(14, 100); + test(15, 100); + test(16, 100); + test(17, 100); + test(18, 100); + test(44, 10); + test(45, 10); + test(46, 10); + test(47, 10); + test(48, 10); + test(49, 10); +#endif + printf("."); fflush(stdout); + } +#if 0 + x[0] = 12580274668139321508UL; + x[1] = 9205793975152560417UL; + x[2] = 7857372727033793057UL; + + y[0] = 13688385828267279103UL; + y[1] = 10575011835742767258UL; + y[2] = 8802048318027595690UL; + + + m[0] = 2981542467342508025UL; + m[1] = 5964669706257742025UL; + m[2] = 18446744073678090270UL; + + invm = 9419286575570128311UL; + + carry = mulredc(z, x, 
y, m, 3, invm); + + printf("%lu + 2^64*(%lu + 2^64*%lu), carry=%lu\n", z[0], z[1], z[2], carry); +#endif + return 0; +} + + +#if 0 + +W := 2^64; + +x0:= 12580274668139321508; +x1:= 9205793975152560417; +x2:= 7857372727033793057; +x := x0 + W*(x1 + W*x2); + +y0:= 13688385828267279103; +y1:= 10575011835742767258; +y2:= 8802048318027595690; +y := y0 + W*(y1 + W*y2); + +m0:= 2981542467342508025; +m1:= 5964669706257742025; +m2:= 18446744073678090270; +m := m0 + W*(m1 + W*m2); + +invm := 9419286575570128311; + + + +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vc14/bench_mulredc/bench_mulredc.vcxproj gmp-ecm-7.0.5+ds/build.vc14/bench_mulredc/bench_mulredc.vcxproj --- gmp-ecm-7.0.4+ds/build.vc14/bench_mulredc/bench_mulredc.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/bench_mulredc/bench_mulredc.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,170 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {4727DE12-787D-432D-B166-BF103B0C3C87} + Win32Proj + bench_mulredc + + + + Application + true + v140 + + + Application + true + v140 + + + Application + false + true + v140 + + + Application + false + true + v140 + + + + + + + + + + + + + + + + + + + + + + + true + $(SolutionDir)..bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + true + $(SolutionDir)..bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\ + MultiThreadedDebug + + + Console + true + psapi.lib;..\..\..\$(mp_dir)lib\$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + + + Level3 + Disabled + _WIN64;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + 
..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\ + MultiThreadedDebug + + + Console + true + psapi.lib;..\..\..\$(mp_dir)lib\$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\ + MultiThreaded + + + Console + true + true + true + psapi.lib;..\..\..\$(mp_dir)lib\$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + _WIN64;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\ + MultiThreaded + + + Console + true + true + true + psapi.lib;..\..\..\$(mp_dir)lib\$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/bench_mulredc/bench_mulredc.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc14/bench_mulredc/bench_mulredc.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc14/bench_mulredc/bench_mulredc.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/bench_mulredc/bench_mulredc.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,23 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/bench_mulredc/Makefile.am gmp-ecm-7.0.5+ds/build.vc14/bench_mulredc/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc14/bench_mulredc/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/bench_mulredc/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = bench_mulredc.vcxproj bench_mulredc.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc14/config.h 
gmp-ecm-7.0.5+ds/build.vc14/config.h --- gmp-ecm-7.0.4+ds/build.vc14/config.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/config.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,246 @@ +/* config.h.in. Generated from configure.in by autoheader. */ + +#define VERSION ECM_VERSION + +#define VERSION_GPU "gpu_ecm-win" + +#define PACKAGE_BUGREPORT "ecm-discuss@inria.fr" + +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. This function is required for `alloca.c' support on those systems. + */ +#undef CRAY_STACKSEG_END + +/* Define to 1 if using `alloca.c'. */ +#define C_ALLOCA 1 + +/* Define to 1 if you have the `access' function. */ +#undef HAVE_ACCESS + +/* Define to 1 if you have `alloca', as a function or macro. */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). + */ +#undef HAVE_ALLOCA_H + +/* Define to 1 if you have the `ctime' function. */ +#define HAVE_CTIME 1 + +/* Define to 1 if you have the <ctype.h> header file. */ +#define HAVE_CTYPE_H 1 + +/* Define to 1 if you have the `floor' function. */ +#define HAVE_FLOOR 1 + +/* Define to 1 if you have the `fmod' function. */ +#define HAVE_FMOD 1 + +/* Define to 1 if you have the `gethostname' function. */ +#define HAVE_GETHOSTNAME 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `gettimeofday' function. */ +#undef HAVE_GETTIMEOFDAY + +/* Define to 1 if you have the <gmp.h> header file. */ +#define HAVE_GMP_H 1 + +/* Define to 1 if gwnum.a or gwnum.lib exist */ +#undef HAVE_GWNUM + +/* Define to 1 if you have the <inttypes.h> header file. */ +#undef HAVE_INTTYPES_H + +/* Define to 1 if you have the <io.h> header file. */ +#undef HAVE_IO_H + +/* Define to 1 if you have the `isascii' function. */ +#undef HAVE_ISASCII + +/* Define to 1 if you have the `isdigit' function. */ +#define HAVE_ISDIGIT 1 + +/* Define to 1 if you have the `isspace' function. 
*/ +#define HAVE_ISSPACE 1 + +/* Define to 1 if you have the `isxdigit' function. */ +#define HAVE_ISXDIGIT 1 + +/* Define to 1 if you have the `m' library (-lm). */ +#undef HAVE_LIBM + +/* Define to 1 if you have the <limits.h> header file. */ +#define HAVE_LIMITS_H 1 + +/* Define to 1 if you have the <malloc.h> header file. */ +#define HAVE_MALLOC_H 1 + +/* Define to 1 if you have the `malloc_usable_size' function. */ +#undef HAVE_MALLOC_USABLE_SIZE + +/* Define to 1 if you have the <math.h> header file. */ +#define HAVE_MATH_H 1 + +/* Define to 1 if you have the `memmove' function. */ +#define HAVE_MEMMOVE 1 + +/* Define to 1 if you have the <memory.h> header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `nice' function. */ +#undef HAVE_NICE + +/* Define to 1 if you have the `pow' function. */ +#define HAVE_POW 1 + +/* Define to 1 if you have the `signal' function. */ +#define HAVE_SIGNAL 1 + +/* Define to 1 if you have the <signal.h> header file. */ +#define HAVE_SIGNAL_H 1 + +/* Define to 1 if you have the `sqrt' function. */ +#define HAVE_SQRT 1 + +/* Define to 1 if you have the <stdint.h> header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the <stdlib.h> header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the <strings.h> header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the <string.h> header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strlen' function. */ +#define HAVE_STRLEN 1 + +/* Define to 1 if you have the `strncasecmp' function. */ +#undef HAVE_STRNCASECMP + +/* Define to 1 if you have the `strstr' function. */ +#undef HAVE_STRSTR + +/* Define to 1 if you have the <sys/resource.h> header file. */ +#undef HAVE_SYS_RESOURCE_H + +/* Define to 1 if you have the <sys/stat.h> header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the <sys/time.h> header file. 
*/ +#undef HAVE_SYS_TIME_H + +/* Define to 1 if you have the <sys/types.h> header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `time' function. */ +#undef HAVE_TIME + +/* Define to 1 if you have the <unistd.h> header file. */ +#undef HAVE_UNISTD_H + +/* Define to 1 if you have the `unlink' function. */ +#define HAVE_UNLINK 1 + +/* Define to 1 if you have the <windows.h> header file. */ +#define HAVE_WINDOWS_H 1 + +/* Define to 1 if you have the `__gmpn_add_nc' function. */ +#if defined( _WIN64 ) +# define HAVE___GMPN_ADD_NC 1 +#endif + +/* Define to 1 if you have the `__gmpn_mod_34lsub1' function. */ +#define HAVE___GMPN_MOD_34LSUB1 1 + +/* Define to 1 if you have the `__gmpn_mul_fft' function. */ +#define HAVE___GMPN_MUL_FFT 1 + +/* Define to 1 if you want memory debugging */ +#undef MEMORY_DEBUG + +/* Define if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 +#define HAVE_LONG_LONG_INT 1 + +/* Define to 1 to use asm redc on x86 or x86_64 */ +# define NATIVE_REDC 1 + +/* Define to 1 if your C compiler doesn't accept -c and -o together. */ +#undef NO_MINUS_C_MINUS_O + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at runtime. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +#undef STACK_DIRECTION + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ +#undef TIME_WITH_SYS_TIME + +/* Define to 1 if you want assertions enabled */ +#undef WANT_ASSERT + +/* Define to 1 if you want shell command execution */ +#undef WANT_SHELLCMD + +/* Define to empty if `const' does not conform to ANSI C. 
*/ +#undef const + +/* How to specify hot-spot attribute, if available */ +#define ATTRIBUTE_HOT + +#define HAVE___GMPN_REDC_1 1 + +#define HAVE___GMPN_REDC_2 1 + +#define HAVE_ASM_REDC3 1 + +#define WINDOWS64_ABI 1 + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +#define inline __inline +#endif + +/* Define to `unsigned int' if does not define. */ +#undef size_t + +#define PRIdSIZE "Id" +#define PRIuSIZE "Iu" + +#ifdef _MSC_VER + +#define __func__ __FUNCTION__ + +/* define Windows tuning here */ +# define __tune_corei7__ + +# if _MSC_VER < 1600 +# define int64_t __int64 +# define uint64_t unsigned __int64 +# endif +# define strncasecmp strnicmp +# define access _access +# define alloca _alloca +# define fseek64 _fseek64 +# define ftell64 _ftell64 +# define omp_get_thread_limit omp_get_max_threads +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vc14/ecm/ecm.vcxproj gmp-ecm-7.0.5+ds/build.vc14/ecm/ecm.vcxproj --- gmp-ecm-7.0.4+ds/build.vc14/ecm/ecm.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/ecm/ecm.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,239 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} + ecm + Win32Proj + 8.1 + + + + Application + v110 + + + Application + v120 + + + Application + v140 + + + Application + v140 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + + Full + true + Speed + 
..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + false + Console + true + true + false + + + MachineX86 + + + + + X64 + + + Full + true + Speed + ..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) + Console + true + true + false + + + MachineX64 + 8388608 + 65536 + + + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + Default + true + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX86 + + + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX64 + 8388608 + 65536 + + + + + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + {cd555681-d65b-4173-a29c-b8bf06a4871b} + false + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/ecm/ecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc14/ecm/ecm.vcxproj.filters --- 
gmp-ecm-7.0.4+ds/build.vc14/ecm/ecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/ecm/ecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,74 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/ecm/Makefile.am gmp-ecm-7.0.5+ds/build.vc14/ecm/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc14/ecm/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/ecm/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = ecm.vcxproj ecm.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu/ecm_gpu.vcxproj gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu/ecm_gpu.vcxproj --- gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu/ecm_gpu.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu/ecm_gpu.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,286 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {1B353D8B-9808-4EB3-A5E7-075D751757AD} + ecm_gpu + Win32Proj + 8.1 + + + + Application + v140 + + + Application + v140 + + + Application + v140 + + + Application + v140 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + 
$(Platform)\$(Configuration)\ + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + + Full + true + Speed + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\lib\$(Platform)\cudart.lib + false + Console + true + true + false + + + MachineX86 + + + compute_50,sm_50 + + + 32 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + X64 + + + Full + true + Speed + ..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) + Console + true + true + false + + + MachineX64 + 8388608 + 65536 + + + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + Default + true + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\$(Platform)\cudart.lib + true + Console + false + + + MachineX86 + + + compute_50,sm_50 + + + 32 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + 
Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\$(Platform)\cudart.lib + true + Console + false + + + MachineX64 + 8388608 + 65536 + + + compute_50,sm_50 + + + 64 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\$(Platform)\cudart.lib + true + + + compute_50,sm_50 + + + 64 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + true + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu/ecm_gpu.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu/ecm_gpu.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu/ecm_gpu.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu/ecm_gpu.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,78 @@ + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + {2a13feaf-0c0e-469a-8047-82c647322da9} + + + {163547c7-89d7-4ddc-b0ad-02b4cfd722b4} + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu/ecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu/ecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu/ecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ 
gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu/ecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,68 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu/Makefile.am gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = ecm_gpu.vcxproj ecm_gpu.vcxproj.filters libecm_gpu.vcxproj libecm_gpu.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu.sln gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu.sln --- gmp-ecm-7.0.4+ds/build.vc14/ecm_gpu.sln 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/ecm_gpu.sln 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,39 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libecm_gpu", "libecm_gpu\libecm_gpu.vcxproj", "{3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ecm_gpu", "ecm_gpu\ecm_gpu.vcxproj", "{1B353D8B-9808-4EB3-A5E7-075D751757AD}" + ProjectSection(ProjectDependencies) = postProject + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00} = {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00} + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + 
Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|Win32.ActiveCfg = Debug|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|Win32.Build.0 = Debug|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|x64.ActiveCfg = Debug|x64 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|x64.Build.0 = Debug|x64 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|Win32.ActiveCfg = Release|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|Win32.Build.0 = Release|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|x64.ActiveCfg = Release|x64 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|x64.Build.0 = Release|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|Win32.ActiveCfg = Debug|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|Win32.Build.0 = Debug|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|x64.ActiveCfg = Debug|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|x64.Build.0 = Debug|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|Win32.ActiveCfg = Release|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|Win32.Build.0 = Release|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|x64.ActiveCfg = Release|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff -Nru gmp-ecm-7.0.4+ds/build.vc14/ecm.sln gmp-ecm-7.0.5+ds/build.vc14/ecm.sln --- gmp-ecm-7.0.4+ds/build.vc14/ecm.sln 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/ecm.sln 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,70 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.24720.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libecm", "libecm\libecm.vcxproj", "{CD555681-D65B-4173-A29C-B8BF06A4871B}" 
+EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ecm", "ecm\ecm.vcxproj", "{C0E2EA85-996A-4B5F-AD30-590FAF5B7187}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tune", "tune\tune.vcxproj", "{80E08750-5C6C-492E-BB1E-7200978AE125}" + ProjectSection(ProjectDependencies) = postProject + {CD555681-D65B-4173-A29C-B8BF06A4871B} = {CD555681-D65B-4173-A29C-B8BF06A4871B} + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} = {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bench_mulredc", "bench_mulredc\bench_mulredc.vcxproj", "{4727DE12-787D-432D-B166-BF103B0C3C87}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "multiecm", "multiecm\multiecm.vcxproj", "{16434DC2-371C-451B-A336-820499B98B8C}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|Win32.ActiveCfg = Debug|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|Win32.Build.0 = Debug|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|x64.ActiveCfg = Debug|x64 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|x64.Build.0 = Debug|x64 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|Win32.ActiveCfg = Release|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|Win32.Build.0 = Release|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|x64.ActiveCfg = Release|x64 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|x64.Build.0 = Release|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|Win32.ActiveCfg = Debug|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|Win32.Build.0 = Debug|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|x64.ActiveCfg = Debug|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|x64.Build.0 = Debug|x64 + 
{C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|Win32.ActiveCfg = Release|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|Win32.Build.0 = Release|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|x64.ActiveCfg = Release|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|x64.Build.0 = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|Win32.ActiveCfg = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|x64.ActiveCfg = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|x64.Build.0 = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|Win32.ActiveCfg = Release|Win32 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|Win32.Build.0 = Release|Win32 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|x64.ActiveCfg = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|x64.Build.0 = Release|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|Win32.ActiveCfg = Debug|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|Win32.Build.0 = Debug|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|x64.ActiveCfg = Debug|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|x64.Build.0 = Debug|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|Win32.ActiveCfg = Release|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|Win32.Build.0 = Release|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|x64.ActiveCfg = Release|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|x64.Build.0 = Release|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|Win32.ActiveCfg = Debug|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|Win32.Build.0 = Debug|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|x64.ActiveCfg = Debug|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|x64.Build.0 = Debug|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|Win32.ActiveCfg = Release|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|Win32.Build.0 = Release|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|x64.ActiveCfg = Release|x64 + 
{16434DC2-371C-451B-A336-820499B98B8C}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff -Nru gmp-ecm-7.0.4+ds/build.vc14/file_copy.bat gmp-ecm-7.0.5+ds/build.vc14/file_copy.bat --- gmp-ecm-7.0.4+ds/build.vc14/file_copy.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/file_copy.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,4 @@ +if not exist %1 ( echo file_copy failure: %1 not found && goto exit ) +if exist %2 ( fc %1 %2 > nul && if not %errorlevel 1 goto exit ) +echo copying %1 to %2 && copy %1 %2 +:exit diff -Nru gmp-ecm-7.0.4+ds/build.vc14/gen_ecm_h.bat gmp-ecm-7.0.5+ds/build.vc14/gen_ecm_h.bat --- gmp-ecm-7.0.4+ds/build.vc14/gen_ecm_h.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/gen_ecm_h.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,13 @@ +@echo off +echo creating ecm.h from ecm.h.in +echo /* generated from ecm-h.in by gen_ecm_h.bat */>tmp.h + +for /f "tokens=1,2*" %%a in (..\ecm.h.in) do ( + if "%%a" EQU "#undef" ( + if "%%b" EQU "ECM_VERSION" ( + echo #define ECM_VERSION "7.0.5">>tmp.h + ) + ) else echo %%a %%b %%c>>tmp.h +) + +call out_copy_rename tmp.h ..\ ecm.h diff -Nru gmp-ecm-7.0.4+ds/build.vc14/getopt.c gmp-ecm-7.0.5+ds/build.vc14/getopt.c --- gmp-ecm-7.0.4+ds/build.vc14/getopt.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/getopt.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,1281 @@ +/* Getopt for GNU. + NOTE: getopt is now part of the C library, so if you don't know what + "Keep this file name-space clean" means, talk to drepper@gnu.org + before changing it! + Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001,2002 + Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* This tells Alpha OSF/1 not to define a getopt prototype in . + Ditto for AIX 3.2 and . */ + +#define HAVE_STRING_H 1 + +#ifndef _NO_PROTO +# define _NO_PROTO +#endif + +#ifdef HAVE_CONFIG_H +# include +#endif + +#if !defined __STDC__ || !__STDC__ +/* This is a separate conditional since some stdc systems + reject `defined (const)'. */ +# ifndef const +# define const +# endif +#endif + +#include + +/* Comment out all this code if we are using the GNU C Library, and are not + actually compiling the library itself. This code is part of the GNU C + Library, but also included in many other GNU distributions. Compiling + and linking in this code is a waste when using the GNU C library + (especially if it is a shared library). Rather than having every GNU + program understand `configure --with-gnu-libc' and omit the object files, + it is simpler to just do this in the source for each such file. */ + +#define GETOPT_INTERFACE_VERSION 2 +#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 +# include +# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION +# define ELIDE_CODE +# endif +#endif + +#ifndef ELIDE_CODE + + +/* This needs to come after some library #include + to get __GNU_LIBRARY__ defined. 
*/ +#ifdef __GNU_LIBRARY__ +/* Don't include stdlib.h for non-GNU C libraries because some of them + contain conflicting prototypes for getopt. */ +# include +# include +#endif /* GNU C library. */ + +#ifdef VMS +# include +# if HAVE_STRING_H - 0 +# include +# endif +#endif + +#ifndef _ +/* This is for other GNU distributions with internationalized messages. */ +# if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC +# include +# ifndef _ +# define _(msgid) gettext (msgid) +# endif +# else +# define _(msgid) (msgid) +# endif +# if defined _LIBC && defined USE_IN_LIBIO +# include +# endif +#endif + +#ifndef attribute_hidden +# define attribute_hidden +#endif + +/* This version of `getopt' appears to the caller like standard Unix `getopt' + but it behaves differently for the user, since it allows the user + to intersperse the options with the other arguments. + + As `getopt' works, it permutes the elements of ARGV so that, + when it is done, all the options precede everything else. Thus + all application programs are extended to handle flexible argument order. + + Setting the environment variable POSIXLY_CORRECT disables permutation. + Then the behavior is completely standard. + + GNU application programs can use a third alternative mode in which + they can distinguish the relative order of options and other arguments. */ + +#include "getopt.h" + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. 
+ + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +/* 1003.2 says this must be 1 before any call. */ +int optind = 1; + +/* Formerly, initialization of getopt depended on optind==0, which + causes problems with re-calling getopt as programs generally don't + know that. */ + +int __getopt_initialized attribute_hidden; + +/* The next char to be scanned in the option-element + in which the last option character we returned was found. + This allows us to pick up the scan where we left off. + + If this is zero, or a null string, it means resume the scan + by advancing to the next ARGV-element. */ + +static char *nextchar; + +/* Callers store zero here to inhibit the error message + for unrecognized options. */ + +int opterr = 1; + +/* Set to an option character which was unrecognized. + This must be initialized on some systems to avoid linking in the + system's own getopt implementation. */ + +int optopt = '?'; + +/* Describe how to deal with options that follow non-option ARGV-elements. + + If the caller did not specify anything, + the default is REQUIRE_ORDER if the environment variable + POSIXLY_CORRECT is defined, PERMUTE otherwise. + + REQUIRE_ORDER means don't recognize them as options; + stop option processing when the first non-option is seen. + This is what Unix does. + This mode of operation is selected by either setting the environment + variable POSIXLY_CORRECT, or using `+' as the first character + of the list of option characters. + + PERMUTE is the default. We permute the contents of ARGV as we scan, + so that eventually all the non-options are at the end. This allows options + to be given in any order, even with programs that were not written to + expect this. + + RETURN_IN_ORDER is an option available to programs that were written + to expect options and other ARGV-elements in any order and that care about + the ordering of the two. 
We describe each non-option ARGV-element + as if it were the argument of an option with character code 1. + Using `-' as the first character of the list of option characters + selects this mode of operation. + + The special argument `--' forces an end of option-scanning regardless + of the value of `ordering'. In the case of RETURN_IN_ORDER, only + `--' can cause `getopt' to return -1 with `optind' != ARGC. */ + +static enum +{ + REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER +} ordering; + +/* Value of POSIXLY_CORRECT environment variable. */ +static char *posixly_correct; + +#ifdef __GNU_LIBRARY__ +/* We want to avoid inclusion of string.h with non-GNU libraries + because there are many ways it can cause trouble. + On some systems, it contains special magic macros that don't work + in GCC. */ +# include +# define my_index strchr +#else + +# if HAVE_STRING_H +# include +# else +# include +# endif + +/* Avoid depending on library functions or files + whose names are inconsistent. */ + +#ifndef getenv +extern char *getenv (); +#endif + +static char * +my_index (str, chr) + const char *str; + int chr; +{ + while (*str) + { + if (*str == chr) + return (char *) str; + str++; + } + return 0; +} + +/* If using GCC, we can safely declare strlen this way. + If not using GCC, it is ok not to declare it. */ +#ifdef __GNUC__ +/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. + That was relevant to code that was here before. */ +# if (!defined __STDC__ || !__STDC__) && !defined strlen +/* gcc with -traditional declares the built-in strlen to return int, + and has done so at least since version 2.4.5. -- rms. */ +extern int strlen (const char *); +# endif /* not __STDC__ */ +#endif /* __GNUC__ */ + +#endif /* not __GNU_LIBRARY__ */ + +/* Handle permutation of arguments. */ + +/* Describe the part of ARGV that contains non-options that have + been skipped. `first_nonopt' is the index in ARGV of the first of them; + `last_nonopt' is the index after the last of them. 
*/ + +static int first_nonopt; +static int last_nonopt; + +#ifdef _LIBC +/* Stored original parameters. + XXX This is no good solution. We should rather copy the args so + that we can compare them later. But we must not use malloc(3). */ +extern int __libc_argc; +extern char **__libc_argv; + +/* Bash 2.0 gives us an environment variable containing flags + indicating ARGV elements that should not be considered arguments. */ + +# ifdef USE_NONOPTION_FLAGS +/* Defined in getopt_init.c */ +extern char *__getopt_nonoption_flags; + +static int nonoption_flags_max_len; +static int nonoption_flags_len; +# endif + +# ifdef USE_NONOPTION_FLAGS +# define SWAP_FLAGS(ch1, ch2) \ + if (nonoption_flags_len > 0) \ + { \ + char __tmp = __getopt_nonoption_flags[ch1]; \ + __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ + __getopt_nonoption_flags[ch2] = __tmp; \ + } +# else +# define SWAP_FLAGS(ch1, ch2) +# endif +#else /* !_LIBC */ +# define SWAP_FLAGS(ch1, ch2) +#endif /* _LIBC */ + +/* Exchange two adjacent subsequences of ARGV. + One subsequence is elements [first_nonopt,last_nonopt) + which contains all the non-options that have been skipped so far. + The other is elements [last_nonopt,optind), which contains all + the options processed since those non-options were skipped. + + `first_nonopt' and `last_nonopt' are relocated so that they describe + the new indices of the non-options in ARGV after they are moved. */ + +#if defined __STDC__ && __STDC__ +static void exchange (char **); +#endif + +static void +exchange (argv) + char **argv; +{ + int bottom = first_nonopt; + int middle = last_nonopt; + int top = optind; + char *tem; + + /* Exchange the shorter segment with the far end of the longer segment. + That puts the shorter segment into the right place. + It leaves the longer segment in the right place overall, + but it consists of two parts that need to be swapped next. 
*/ + +#if defined _LIBC && defined USE_NONOPTION_FLAGS + /* First make sure the handling of the `__getopt_nonoption_flags' + string can work normally. Our top argument must be in the range + of the string. */ + if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) + { + /* We must extend the array. The user plays games with us and + presents new arguments. */ + char *new_str = malloc (top + 1); + if (new_str == NULL) + nonoption_flags_len = nonoption_flags_max_len = 0; + else + { + memset (__mempcpy (new_str, __getopt_nonoption_flags, + nonoption_flags_max_len), + '\0', top + 1 - nonoption_flags_max_len); + nonoption_flags_max_len = top + 1; + __getopt_nonoption_flags = new_str; + } + } +#endif + + while (top > middle && middle > bottom) + { + if (top - middle > middle - bottom) + { + /* Bottom segment is the short one. */ + int len = middle - bottom; + register int i; + + /* Swap it with the top part of the top segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[top - (middle - bottom) + i]; + argv[top - (middle - bottom) + i] = tem; + SWAP_FLAGS (bottom + i, top - (middle - bottom) + i); + } + /* Exclude the moved bottom segment from further swapping. */ + top -= len; + } + else + { + /* Top segment is the short one. */ + int len = top - middle; + register int i; + + /* Swap it with the bottom part of the bottom segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[middle + i]; + argv[middle + i] = tem; + SWAP_FLAGS (bottom + i, middle + i); + } + /* Exclude the moved top segment from further swapping. */ + bottom += len; + } + } + + /* Update records for the slots the non-options now occupy. */ + + first_nonopt += (optind - last_nonopt); + last_nonopt = optind; +} + +/* Initialize the internal data when the first call is made. 
*/ + +#if defined __STDC__ && __STDC__ +static const char *_getopt_initialize (int, char *const *, const char *); +#endif +static const char * +_getopt_initialize (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + /* Start processing options with ARGV-element 1 (since ARGV-element 0 + is the program name); the sequence of previously skipped + non-option ARGV-elements is empty. */ + + first_nonopt = last_nonopt = optind; + + nextchar = NULL; + + posixly_correct = getenv ("POSIXLY_CORRECT"); + + /* Determine how to handle the ordering of options and nonoptions. */ + + if (optstring[0] == '-') + { + ordering = RETURN_IN_ORDER; + ++optstring; + } + else if (optstring[0] == '+') + { + ordering = REQUIRE_ORDER; + ++optstring; + } + else if (posixly_correct != NULL) + ordering = REQUIRE_ORDER; + else + ordering = PERMUTE; + +#if defined _LIBC && defined USE_NONOPTION_FLAGS + if (posixly_correct == NULL + && argc == __libc_argc && argv == __libc_argv) + { + if (nonoption_flags_max_len == 0) + { + if (__getopt_nonoption_flags == NULL + || __getopt_nonoption_flags[0] == '\0') + nonoption_flags_max_len = -1; + else + { + const char *orig_str = __getopt_nonoption_flags; + int len = nonoption_flags_max_len = strlen (orig_str); + if (nonoption_flags_max_len < argc) + nonoption_flags_max_len = argc; + __getopt_nonoption_flags = + (char *) malloc (nonoption_flags_max_len); + if (__getopt_nonoption_flags == NULL) + nonoption_flags_max_len = -1; + else + memset (__mempcpy (__getopt_nonoption_flags, orig_str, len), + '\0', nonoption_flags_max_len - len); + } + } + nonoption_flags_len = nonoption_flags_max_len; + } + else + nonoption_flags_len = 0; +#endif + + return optstring; +} + +/* Scan elements of ARGV (whose length is ARGC) for option characters + given in OPTSTRING. + + If an element of ARGV starts with '-', and is not exactly "-" or "--", + then it is an option element. 
The characters of this element + (aside from the initial '-') are option characters. If `getopt' + is called repeatedly, it returns successively each of the option characters + from each of the option elements. + + If `getopt' finds another option character, it returns that character, + updating `optind' and `nextchar' so that the next call to `getopt' can + resume the scan with the following option character or ARGV-element. + + If there are no more option characters, `getopt' returns -1. + Then `optind' is the index in ARGV of the first ARGV-element + that is not an option. (The ARGV-elements have been permuted + so that those that are not options now come last.) + + OPTSTRING is a string containing the legitimate option characters. + If an option character is seen that is not listed in OPTSTRING, + return '?' after printing an error message. If you set `opterr' to + zero, the error message is suppressed but we still return '?'. + + If a char in OPTSTRING is followed by a colon, that means it wants an arg, + so the following text in the same ARGV-element, or the text of the following + ARGV-element, is returned in `optarg'. Two colons mean an option that + wants an optional arg; if there is text in the current ARGV-element, + it is returned in `optarg', otherwise `optarg' is set to zero. + + If OPTSTRING starts with `-' or `+', it requests different methods of + handling the non-option ARGV-elements. + See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. + + Long-named options begin with `--' instead of `-'. + Their names may be abbreviated as long as the abbreviation is unique + or is an exact match for some defined option. If they have an + argument, it follows the option name in the same ARGV-element, separated + from the option name by a `=', or else in the next ARGV-element. + When `getopt' finds a long-named option, it returns 0 if that option's + `flag' field is nonzero, the value of the option's `val' field + if the `flag' field is zero.
+ + The elements of ARGV aren't really const, because we permute them. + But we pretend they're const in the prototype to be compatible + with other systems. + + LONGOPTS is a vector of `struct option' terminated by an + element containing a name which is zero. + + LONGIND returns the index in LONGOPT of the long-named option found. + It is only valid when a long-named option has been found by the most + recent call. + + If LONG_ONLY is nonzero, '-' as well as '--' can introduce + long-named options. */ + +int +_getopt_internal (argc, argv, optstring, longopts, longind, long_only) + int argc; + char *const *argv; + const char *optstring; + const struct option *longopts; + int *longind; + int long_only; +{ + int print_errors = opterr; + if (optstring[0] == ':') + print_errors = 0; + + if (argc < 1) + return -1; + + optarg = NULL; + + if (optind == 0 || !__getopt_initialized) + { + if (optind == 0) + optind = 1; /* Don't scan ARGV[0], the program name. */ + optstring = _getopt_initialize (argc, argv, optstring); + __getopt_initialized = 1; + } + + /* Test whether ARGV[optind] points to a non-option argument. + Either it does not have option syntax, or there is an environment flag + from the shell indicating it is not an option. The later information + is only used when the used in the GNU libc. */ +#if defined _LIBC && defined USE_NONOPTION_FLAGS +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ + || (optind < nonoption_flags_len \ + && __getopt_nonoption_flags[optind] == '1')) +#else +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') +#endif + + if (nextchar == NULL || *nextchar == '\0') + { + /* Advance to the next ARGV-element. */ + + /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been + moved back by the user (who may also have changed the arguments). 
*/ + if (last_nonopt > optind) + last_nonopt = optind; + if (first_nonopt > optind) + first_nonopt = optind; + + if (ordering == PERMUTE) + { + /* If we have just processed some options following some non-options, + exchange them so that the options come first. */ + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (last_nonopt != optind) + first_nonopt = optind; + + /* Skip any additional non-options + and extend the range of non-options previously skipped. */ + + while (optind < argc && NONOPTION_P) + optind++; + last_nonopt = optind; + } + + /* The special ARGV-element `--' means premature end of options. + Skip it like a null option, + then exchange with previous non-options as if it were an option, + then skip everything else like a non-option. */ + + if (optind != argc && !strcmp (argv[optind], "--")) + { + optind++; + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (first_nonopt == last_nonopt) + first_nonopt = optind; + last_nonopt = argc; + + optind = argc; + } + + /* If we have done all the ARGV-elements, stop the scan + and back over any non-options that we skipped and permuted. */ + + if (optind == argc) + { + /* Set the next-arg-index to point at the non-options + that we previously skipped, so the caller will digest them. */ + if (first_nonopt != last_nonopt) + optind = first_nonopt; + return -1; + } + + /* If we have come to a non-option and did not permute it, + either stop the scan or describe it to the caller and pass it by. */ + + if (NONOPTION_P) + { + if (ordering == REQUIRE_ORDER) + return -1; + optarg = argv[optind++]; + return 1; + } + + /* We have found another option-ARGV-element. + Skip the initial punctuation. */ + + nextchar = (argv[optind] + 1 + + (longopts != NULL && argv[optind][1] == '-')); + } + + /* Decode the current option-ARGV-element. */ + + /* Check whether the ARGV-element is a long option. 
+ + If long_only and the ARGV-element has the form "-f", where f is + a valid short option, don't consider it an abbreviated form of + a long option that starts with f. Otherwise there would be no + way to give the -f short option. + + On the other hand, if there's a long option "fubar" and + the ARGV-element is "-fu", do consider that an abbreviation of + the long option, just like "--fu", and not "-f" with arg "u". + + This distinction seems to be the most useful approach. */ + + if (longopts != NULL + && (argv[optind][1] == '-' + || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1]))))) + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = -1; + int option_index; + + for (nameend = nextchar; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) + == (unsigned int) strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else if (long_only + || pfound->has_arg != p->has_arg + || pfound->flag != p->flag + || pfound->val != p->val) + /* Second or later nonexact match found. 
*/ + ambig = 1; + } + + if (ambig && !exact) + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]) >= 0) + { + + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]); +#endif + } + nextchar += strlen (nextchar); + optind++; + optopt = 0; + return '?'; + } + + if (pfound != NULL) + { + option_index = indfound; + optind++; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. */ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + int n; +#endif + + if (argv[optind - 1][1] == '-') + { + /* --option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("\ +%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); +#else + fprintf (stderr, _("\ +%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); +#endif + } + else + { + /* +option or -option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("\ +%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], + pfound->name); +#else + fprintf (stderr, _("\ +%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], pfound->name); +#endif + } + +#if defined _LIBC && defined USE_IN_LIBIO + if (n >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#endif + } + + nextchar += strlen (nextchar); + + optopt = pfound->val; + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if 
(__asprintf (&buf, _("\ +%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); +#endif + } + nextchar += strlen (nextchar); + optopt = pfound->val; + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + + /* Can't find it as a long option. If this is not getopt_long_only, + or the option starts with '--' or is not a valid short + option, then it's an error. + Otherwise interpret it as a short option. */ + if (!long_only || argv[optind][1] == '-' + || my_index (optstring, *nextchar) == NULL) + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + int n; +#endif + + if (argv[optind][1] == '-') + { + /* --option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); +#else + fprintf (stderr, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); +#endif + } + else + { + /* +option or -option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); +#else + fprintf (stderr, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); +#endif + } + +#if defined _LIBC && defined USE_IN_LIBIO + if (n >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#endif + } + nextchar = (char *) ""; + optind++; + optopt = 0; + return '?'; + } + } + + /* Look at and handle the next short option-character. 
*/ + + { + char c = *nextchar++; + char *temp = my_index (optstring, c); + + /* Increment `optind' when we start to process its last character. */ + if (*nextchar == '\0') + ++optind; + + if (temp == NULL || c == ':') + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + int n; +#endif + + if (posixly_correct) + { + /* 1003.2 specifies the format of this message. */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: illegal option -- %c\n"), + argv[0], c); +#else + fprintf (stderr, _("%s: illegal option -- %c\n"), argv[0], c); +#endif + } + else + { +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: invalid option -- %c\n"), + argv[0], c); +#else + fprintf (stderr, _("%s: invalid option -- %c\n"), argv[0], c); +#endif + } + +#if defined _LIBC && defined USE_IN_LIBIO + if (n >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#endif + } + optopt = c; + return '?'; + } + /* Convenience. Treat POSIX -W foo same as long option --foo */ + if (temp[0] == 'W' && temp[1] == ';') + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = 0; + int option_index; + + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. 
*/ +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, + _("%s: option requires an argument -- %c\n"), + argv[0], c) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("%s: option requires an argument -- %c\n"), + argv[0], c); +#endif + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + return c; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + + /* optarg is now the argument, see if it's in the + table of longopts. */ + + for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) == strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. */ + ambig = 1; + } + if (ambig && !exact) + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]); +#endif + } + nextchar += strlen (nextchar); + optind++; + return '?'; + } + if (pfound != NULL) + { + option_index = indfound; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. 
*/ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name); +#endif + } + + nextchar += strlen (nextchar); + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); +#endif + } + nextchar += strlen (nextchar); + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + nextchar = NULL; + return 'W'; /* Let the application handle it. */ + } + if (temp[1] == ':') + { + if (temp[2] == ':') + { + /* This is an option that accepts an argument optionally. */ + if (*nextchar != '\0') + { + optarg = nextchar; + optind++; + } + else + optarg = NULL; + nextchar = NULL; + } + else + { + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. 
*/ +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option requires an argument -- %c\n"), + argv[0], c) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, + _("%s: option requires an argument -- %c\n"), + argv[0], c); +#endif + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + nextchar = NULL; + } + } + return c; + } +} + +int +getopt (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + return _getopt_internal (argc, argv, optstring, + (const struct option *) 0, + (int *) 0, + 0); +} + +#endif /* Not ELIDE_CODE. */ + +#ifdef TEST + +/* Compile with -DTEST to make an executable for use in testing + the above definition of `getopt'. */ + +int +main (argc, argv) + int argc; + char **argv; +{ + int c; + int digit_optind = 0; + + while (1) + { + int this_option_optind = optind ? optind : 1; + + c = getopt (argc, argv, "abc:d:0123456789"); + if (c == -1) + break; + + switch (c) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (digit_optind != 0 && digit_optind != this_option_optind) + printf ("digits occur in two different argv-elements.\n"); + digit_optind = this_option_optind; + printf ("option %c\n", c); + break; + + case 'a': + printf ("option a\n"); + break; + + case 'b': + printf ("option b\n"); + break; + + case 'c': + printf ("option c with value `%s'\n", optarg); + break; + + case '?': + break; + + default: + printf ("?? 
getopt returned character code 0%o ??\n", c); + } + } + + if (optind < argc) + { + printf ("non-option ARGV-elements: "); + while (optind < argc) + printf ("%s ", argv[optind++]); + printf ("\n"); + } + + exit (0); +} + +#endif /* TEST */ + diff -Nru gmp-ecm-7.0.4+ds/build.vc14/getopt.h gmp-ecm-7.0.5+ds/build.vc14/getopt.h --- gmp-ecm-7.0.4+ds/build.vc14/getopt.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/getopt.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,182 @@ +/* Declarations for getopt. + Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef _GETOPT_H + +#ifndef __need_getopt +# define _GETOPT_H 1 +#endif + +/* If __GNU_LIBRARY__ is not already defined, either we are being used + standalone, or this is the first header included in the source file. + If we are being used with glibc, we need to include , but + that does not exist if we are standalone. So: if __GNU_LIBRARY__ is + not defined, include , which will pull in for us + if it's from glibc. (Why ctype.h? It's guaranteed to exist and it + doesn't flood the namespace with stuff the way some other headers do.) 
*/ +#if !defined __GNU_LIBRARY__ +# include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +extern char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +extern int optind; + +/* Callers store zero here to inhibit the error message `getopt' prints + for unrecognized options. */ + +extern int opterr; + +/* Set to an option character which was unrecognized. */ + +extern int optopt; + +#ifndef __need_getopt +/* Describe the long-named options requested by the application. + The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector + of `struct option' terminated by an element containing a name which is + zero. + + The field `has_arg' is: + no_argument (or 0) if the option does not take an argument, + required_argument (or 1) if the option requires an argument, + optional_argument (or 2) if the option takes an optional argument. + + If the field `flag' is not NULL, it points to a variable that is set + to the value given in the field `val' when the option is found, but + left unchanged if the option is not found. 
+ + To have a long-named option do something other than set an `int' to + a compiled-in constant, such as set a value from `optarg', set the + option's `flag' field to zero and its `val' field to a nonzero + value (the equivalent single-letter option character, if there is + one). For long options that have a zero `flag' field, `getopt' + returns the contents of the `val' field. */ + +struct option +{ +# if (defined __STDC__ && __STDC__) || defined __cplusplus + const char *name; +# else + char *name; +# endif + /* has_arg can't be an enum because some compilers complain about + type mismatches in all the code that assumes it is an int. */ + int has_arg; + int *flag; + int val; +}; + +/* Names for the values of the `has_arg' field of `struct option'. */ + +# define no_argument 0 +# define required_argument 1 +# define optional_argument 2 +#endif /* need getopt */ + + +/* Get definitions and prototypes for functions to process the + arguments in ARGV (ARGC of them, minus the program name) for + options given in OPTS. + + Return the option character from OPTS just read. Return -1 when + there are no more options. For unrecognized options, or options + missing arguments, `optopt' is set to the option letter, and '?' is + returned. + + The OPTS string is a list of characters which are recognized option + letters, optionally followed by colons, specifying that that letter + takes an argument, to be placed in `optarg'. + + If a letter in OPTS is followed by two colons, its argument is + optional. This behavior is specific to the GNU `getopt'. + + The argument `--' causes premature termination of argument + scanning, explicitly telling `getopt' that there are no more + options. + + If OPTS begins with `--', then non-option arguments are treated as + arguments to the option '\0'. This behavior is specific to the GNU + `getopt'. 
*/ + +#if (defined __STDC__ && __STDC__) || defined __cplusplus +# ifdef __GNU_LIBRARY__ +/* Many other libraries have conflicting prototypes for getopt, with + differences in the consts, in stdlib.h. To avoid compilation + errors, only prototype getopt for the GNU C library. */ +extern int getopt (int ___argc, char *const *___argv, const char *__shortopts); +# else /* not __GNU_LIBRARY__ */ +extern int getopt (); +# endif /* __GNU_LIBRARY__ */ + +# ifndef __need_getopt +extern int getopt_long (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind); +extern int getopt_long_only (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind); + +/* Internal only. Users should not call this directly. */ +extern int _getopt_internal (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind, + int __long_only); +# endif +#else /* not __STDC__ */ +extern int getopt (); +# ifndef __need_getopt +extern int getopt_long (); +extern int getopt_long_only (); + +extern int _getopt_internal (); +# endif +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +/* Make sure we later can get all the definitions and declarations. */ +#undef __need_getopt + +#endif /* getopt.h */ + diff -Nru gmp-ecm-7.0.4+ds/build.vc14/getrusage.c gmp-ecm-7.0.5+ds/build.vc14/getrusage.c --- gmp-ecm-7.0.4+ds/build.vc14/getrusage.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/getrusage.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,71 @@ +/* This file is part of the MPIR Library. + + The MPIR Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, or (at + your option) any later version. 
+ The MPIR Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + You should have received a copy of the GNU Lesser General Public License + along with the MPIR Library; see the file COPYING.LIB. If not, write + to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. +*/ + +#define WIN32_LEAN_AND_MEAN + +#include +#include +#include + +#include "getrusage.h" + +typedef union file_t +{ FILETIME ft; + long long lt; +} file_t; + +int getrusage(int who, rusage *usage) +{ + HANDLE proc_hand; + file_t c_time, x_time, s_time, u_time; + int cb = 0, err = -1; + + if(who != RUSAGE_SELF) + { + errno = (who == RUSAGE_CHILDREN ? ENODATA : EINVAL); + return err; + } + + proc_hand = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, GetCurrentProcessId()); + + if(GetProcessTimes(proc_hand, &(c_time.ft), &(x_time.ft), &(s_time.ft), &(u_time.ft))) + { + PROCESS_MEMORY_COUNTERS ctrs; + + /* The units returned by GetProcessTimes are 100 nanoseconds */ + u_time.lt = (u_time.lt + 5) / 10; + s_time.lt = (s_time.lt + 5) / 10; + + usage->ru_utime.tv_sec = (long)(u_time.lt / 1000000ll); + usage->ru_stime.tv_sec = (long)(s_time.lt / 1000000ll); + usage->ru_utime.tv_usec = (long)(u_time.lt % 1000000ll); + usage->ru_stime.tv_usec = (long)(s_time.lt % 1000000ll); + + if(GetProcessMemoryInfo(proc_hand, &ctrs, sizeof(ctrs))) + { + PERFORMANCE_INFORMATION perf_info; + GetPerformanceInfo(&perf_info, sizeof(perf_info)); + usage->ru_maxrss = (DWORD) (ctrs.WorkingSetSize / perf_info.PageSize); + usage->ru_majflt = ctrs.PageFaultCount; + err = 0; + } + } + + if(err) + errno = EACCES; + CloseHandle(proc_hand); + return err; +} diff -Nru gmp-ecm-7.0.4+ds/build.vc14/getrusage.h gmp-ecm-7.0.5+ds/build.vc14/getrusage.h --- gmp-ecm-7.0.4+ds/build.vc14/getrusage.h 
1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/getrusage.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,46 @@ + +#ifndef _GETRUSAGE_H +#define _GETRUSAGE_H + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#define ENODATA 61 +#define RUSAGE_SELF 0 +#define RUSAGE_CHILDREN -1 + +typedef struct +{ + long tv_sec; + long tv_usec; +} tval; + +typedef struct rusage +{ + tval ru_utime; /* user time used */ + tval ru_stime; /* system time used */ + long ru_maxrss; /* integral max resident set size */ + long ru_ixrss; /* integral shared text memory size */ + long ru_idrss; /* integral unshared data size */ + long ru_isrss; /* integral unshared stack size */ + long ru_minflt; /* page reclaims */ + long ru_majflt; /* page faults */ + long ru_nswap; /* swaps */ + long ru_inblock; /* block input operations */ + long ru_oublock; /* block output operations */ + long ru_msgsnd; /* messages sent */ + long ru_msgrcv; /* messages received */ + long ru_nsignals;/* signals received */ + long ru_nvcsw; /* voluntary context switches */ + long ru_nivcsw; /* involuntary context switches */ +} rusage; + +int getrusage(int who, rusage *usage); + +#if defined(__cplusplus) +} +#endif + +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vc14/gettimeofday.c gmp-ecm-7.0.5+ds/build.vc14/gettimeofday.c --- gmp-ecm-7.0.4+ds/build.vc14/gettimeofday.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/gettimeofday.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,39 @@ + +#define WIN32_LEAN_AND_MEAN +#include +#include + +#include "gettimeofday.h" + +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + FILETIME ft; + LARGE_INTEGER li; + __int64 t; + static int tzflag; + + if(tv) + { + GetSystemTimeAsFileTime(&ft); + li.LowPart = ft.dwLowDateTime; + li.HighPart = ft.dwHighDateTime; + t = li.QuadPart; + t -= EPOCHFILETIME; + t /= 10; + tv->tv_sec = (long)(t / 1000000); + tv->tv_usec = (long)(t % 1000000); + } + + if (tz) + { + if (!tzflag) + { + _tzset(); + 
tzflag++; + } + tz->tz_minuteswest = _timezone / 60; + tz->tz_dsttime = _daylight; + } + + return 0; +} diff -Nru gmp-ecm-7.0.4+ds/build.vc14/gettimeofday.h gmp-ecm-7.0.5+ds/build.vc14/gettimeofday.h --- gmp-ecm-7.0.4+ds/build.vc14/gettimeofday.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/gettimeofday.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,34 @@ +/* + * timeval.h 1.0 01/12/19 + * + * Defines gettimeofday, timeval, etc. for Win32 + * + * By Wu Yongwei + * + */ +#ifndef _TIMEVAL_H +#define _TIMEVAL_H + +#include + +#define EPOCHFILETIME (116444736000000000LL) + +#if defined(__cplusplus) +extern "C" +{ +#endif + +struct timezone +{ + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +int gettimeofday(struct timeval *tv, struct timezone *tz); + +#if defined(__cplusplus) +} +#endif + +#endif /* _TIMEVAL_H */ + diff -Nru gmp-ecm-7.0.4+ds/build.vc14/libecm/libecm.vcxproj gmp-ecm-7.0.5+ds/build.vc14/libecm/libecm.vcxproj --- gmp-ecm-7.0.4+ds/build.vc14/libecm/libecm.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/libecm/libecm.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,245 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {CD555681-D65B-4173-A29C-B8BF06A4871B} + libecm + Win32Proj + + + + StaticLibrary + v120 + + + StaticLibrary + v120 + + + StaticLibrary + v140 + + + StaticLibrary + Static + v140 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + + ..\prebuild NO_GPU + + + Full + true + 
..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + Default + true + + + + + + ..\prebuild NO_GPU + + + X64 + + + Full + true + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + Default + true + + + + + _WIN64 + + + + + ..\prebuild NO_GPU + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + Default + true + + + + + + ..\prebuild NO_GPU + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;_DEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + Default + true + + + + + _WIN64 + + + + + + + + + + + + + + + + + + + + + + + true + true + true + true + + + + + + + + + + + + + + + + + + + Full + + + Full + + + Full + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/libecm/libecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc14/libecm/libecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc14/libecm/libecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/libecm/libecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,190 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {2f18179f-5dba-420c-8dc7-bc7f8228a1b2} + + + + + Source Files\Assembler + + + Source Files\Assembler + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + 
Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/libecm/Makefile.am gmp-ecm-7.0.5+ds/build.vc14/libecm/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc14/libecm/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/libecm/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = libecm.vcxproj libecm.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc14/libecm_gpu/libecm_gpu.vcxproj gmp-ecm-7.0.5+ds/build.vc14/libecm_gpu/libecm_gpu.vcxproj --- gmp-ecm-7.0.4+ds/build.vc14/libecm_gpu/libecm_gpu.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/libecm_gpu/libecm_gpu.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,315 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00} + libecm_gpu + Win32Proj + 8.1 + + + + StaticLibrary + v140 + + + StaticLibrary + v140 + + + StaticLibrary + v140 + + + StaticLibrary + Static + v140 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
<_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + ecmlib + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + + ..\prebuild GPU + + + true + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;NDEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + Default + Full + + + + compute_50,sm_50 + + + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 32 + + + true + + + + + ..\prebuild GPU + + + X64 + + + Full + true + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;NDEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + Default + true + + + + + _WIN64 + + + compute_50,sm_50 + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 64 + + + true + + + + + + + + + ..\prebuild GPU + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_DEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + Default + true + + + + compute_50,sm_50 + + + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 32 + + + true + + + + + ..\prebuild GPU + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;_DEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + Default + true + + + + + _WIN64 + + + compute_50,sm_50 + + + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 64 + + + true + + + + + true + true + true + + + + + + + + + + + + + + + + true + true + true + true + + + + + + 
+ + + + + + + + + + + + + Full + + + Full + + + Full + + + + true + true + true + + + + + + + + + + + + + + + + + + + + + + + + + Document + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/libecm_gpu/libecm_gpu.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc14/libecm_gpu/libecm_gpu.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc14/libecm_gpu/libecm_gpu.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/libecm_gpu/libecm_gpu.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,193 @@ + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + {dfe792df-b4ff-4147-be95-190117baae33} + + + {0315d9d5-3f8f-456a-ae54-e00de69b9350} + + + {cbe6b893-95dc-4f4b-b2e9-73245cf57c75} + + + + + Source Files + + + + + Source Files\Assembler + + + Source Files\Assembler + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/libecm_gpu/Makefile.am gmp-ecm-7.0.5+ds/build.vc14/libecm_gpu/Makefile.am --- 
gmp-ecm-7.0.4+ds/build.vc14/libecm_gpu/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/libecm_gpu/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = libecm_gpu.vcxproj libecm_gpu.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc14/Makefile.am gmp-ecm-7.0.5+ds/build.vc14/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc14/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,7 @@ +EXTRA_DIST = config.h ecm.sln ecm_gpu.sln file_copy.bat gen_ecm_h.bat \ + getopt.c getopt.h getrusage.c getrusage.h gettimeofday.c \ + gettimeofday.h mp_lib.props out_copy_rename.bat prebuild.bat \ + python.bat readme.txt tests.py vacopy.c vsyasm.props \ + vsyasm.targets vsyasm.xml + +DIST_SUBDIRS = assembler ecm ecm_gpu libecm libecm_gpu tune bench_mulredc diff -Nru gmp-ecm-7.0.4+ds/build.vc14/mp_lib.props gmp-ecm-7.0.5+ds/build.vc14/mp_lib.props --- gmp-ecm-7.0.4+ds/build.vc14/mp_lib.props 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/mp_lib.props 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,20 @@ + + + + mpir\ + mpir.lib + + + <_ProjectFileVersion>10.0.30128.1 + + + + $(mp_dir) + true + + + $(mp_lib) + true + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/multiecm/Makefile.am gmp-ecm-7.0.5+ds/build.vc14/multiecm/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc14/multiecm/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/multiecm/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = multiecm.vcxproj multiecm.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc14/multiecm/multiecm.vcxproj gmp-ecm-7.0.5+ds/build.vc14/multiecm/multiecm.vcxproj --- gmp-ecm-7.0.4+ds/build.vc14/multiecm/multiecm.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/multiecm/multiecm.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,238 @@ + + + + + Debug + 
Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + multiecm + Win32Proj + 8.1 + {16434DC2-371C-451B-A336-820499B98B8C} + + + + Application + v110 + + + Application + v120 + + + Application + v140 + + + Application + v140 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + + Full + true + Speed + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + false + Console + true + true + false + + + MachineX86 + + + + + X64 + + + Full + true + Speed + ..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) + Console + true + true + false + + + MachineX64 + 8388608 + 65536 + + + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + Default + true + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX86 + + + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + 
WIN32;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX64 + 8388608 + 65536 + + + + + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + + + + + + + + + + + + + + + + + + + + + + + + + + + {cd555681-d65b-4173-a29c-b8bf06a4871b} + false + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/multiecm/multiecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc14/multiecm/multiecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc14/multiecm/multiecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/multiecm/multiecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,71 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/out_copy_rename.bat gmp-ecm-7.0.5+ds/build.vc14/out_copy_rename.bat --- gmp-ecm-7.0.4+ds/build.vc14/out_copy_rename.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/out_copy_rename.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,31 @@ +@echo off +if not exist %1 goto nofile +if exist %2 goto next + +echo creating directory %2 +md %2 > nul + +:next +rem strip quotes if present +set str=%2 +for /f "useback tokens=*" 
%%a in ('%str%') do set str=%%~a + +rem add a backslash if the output directory lacks one +set str=%str:~-1% +if "%str%" == "\" (set outf=%2%3) else (set outf=%2\%3) + +echo copying %1 to %outf% (if not present or changed) +if not exist "%outf%" goto copy + +rem don't overwrite if output exists and is not changed +fc %1 %outf% > nul && if not %errorlevel 1 goto exit +echo overwriting %outf% with %1 + +:copy +copy %1 %outf% > nul +goto exit + +:nofile +echo %1 not found + +:exit diff -Nru gmp-ecm-7.0.4+ds/build.vc14/prebuild.bat gmp-ecm-7.0.5+ds/build.vc14/prebuild.bat --- gmp-ecm-7.0.4+ds/build.vc14/prebuild.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/prebuild.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ +cd ..\ +call out_copy_rename config.h ..\ config.h +call gen_ecm_h diff -Nru gmp-ecm-7.0.4+ds/build.vc14/python.bat gmp-ecm-7.0.5+ds/build.vc14/python.bat --- gmp-ecm-7.0.4+ds/build.vc14/python.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/python.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,2 @@ +@echo off +"c:\program files\python34\python" %1 diff -Nru gmp-ecm-7.0.4+ds/build.vc14/readme.txt gmp-ecm-7.0.5+ds/build.vc14/readme.txt --- gmp-ecm-7.0.4+ds/build.vc14/readme.txt 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/readme.txt 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,173 @@ + +Building GMP-ECM with Microsoft Visual C++ 2015 +=============================================== + +If you wish to build the assembler code support you will need to +install the YASM assembler that is available at: + + http://www.tortall.net/projects/yasm/ + +THe version you need is vsyasm, which should be put it in the same +directory as your Visual C++ compiler, which is typically: + + C:\Program Files\yasm + +The Multi-Precision Library - GMP and MPIR +========================================== + +GMP-ECM works with either GMP or MPIR, a fork of GMP. 
To build and run +GMP-ECM using Visual Studio you first need to obtain and build either +GMP or MPIR. MPIR has a fully integrated Visual Studio build system +for Windows but GMP does not. + +The VC++ build of GMP-ECM now defaults to MPIR but the property sheet +mp_lib.props can be edited to set the macro mp_lib to 'gmp' instead +of 'mpir' to build ECM using GMP. + +GMP +=== + +GMP can be built from the GMP source code available here: + + http://gmplib.org/ + +GMP can be built with mingw for 32-bit Windows and mingw64 for Windows x64. +It is reported that the resulting libraries work with Visual Studio when +appropriately renamed. + +MPIR +==== + +MPIR is available here: + + http://www.mpir.org + +It has full support for building MPIR for 32 and 64 bit Windows systems +with x86 assembler support using the YASM assembler. + +Building GMP-ECM +================ + +The build files for GMP-ECM assume that the GMP and ECM build directories +are in a common parent directory as follows: + + Parent Directory + MPIR (or GMP) + build.vc14 -- MPIR (or GMP) build files + ... + GMP-ECM + build.vc14 -- ECM build files + +The root directories for GMP and GMP-ECM are assumed to have these names +irrespective of which version is being used (they used to be followed by +version numbers but this meant that the build projects had to be updated +too frequently). + +The normal (non GPU) build is opened by loading the file ecm.sln (from +the build.vc14 directory) into Visual Studio. This provides these build +projects in build.vc14 for the non GPU build: + + ecm - the ECM application + ecmlib - the ECM library + tune - a program for tuning + bench_mulredc - for benchmarking mulredc + multiecm - work in progress (not working) + +The GPU build is opened by loading the file ecm_gpu.sln (from the build.vc14 +directory) into Visual Studio.
This provides two build projects in +build.vc14: + + ecm_gpu - the ECM application + ecmlib_gpu - the ECM library + +In all cases you have to choose either a win32 or x64 build and either a +Release or Debug configuration. + +The non GPU Build +----------------- + +Before starting a build, there are a number of configuration options +that need to be set: + +1. If you wish to compile GMP-ECM for use on a particular processor, + select the appropriate define from the file 'ecm-params.h' in the + GMP-ECM root directory and decide which of the defines suit your + needs (e.g. __tune_corei7__). Then replace the existing define: + + /* define Windows tuning here */ + # define __tune_corei7__ + + towards the end of the config.h file in the 'build.vc14' + directory (build.vc14\config.h) with the chosen define. + +2. The file at 'build.vc14\mul_fft-params.h' allows the FFT code to + be tuned to 32 or 64-bit systems by selecting an option by + changing the appropriate '#elif 0' to '#elif 1'. If you wish to + use the win32 AMD assembler files, you also have to use the + Visual Studio property page to define AMD_ASM (alternatively + you can edit the two files mulredc.asm and redc.asm in the + build.vc14\assembler\ directory to include the AMD assembler). + +The GPU Build +------------- + +1. If you wish to build with a GPU capability you will need to + install Nvidia Nsight for Visual Studio and the CUDA Toolkit + v8.0. You then build the libecm_gpu and ecm_gpu projects. + +2. The choices above for the non GPU build also apply when + building for a GPU based system. + + By default, the GPU configuration is "compute_50,sm_50". If + you need to change this, select libecm_gpu and ecm_gpu and + set the properties for "CUDA C/C++|Device|Code Generation" for + your GPU capability.
+ + Also under "C/C++|Preprocessor|Preprocessor Definitions" for + both these projects, change the current definition GPU_CC50 to + that for your GPU capability. + +Build Configurations +-------------------- + +When a version of ecm and ecmlib are built, the library and the application +are put in the directory matching the configuration that has been built: + + GMP-ECM + build.vc14 -- ECM build files + lib -- ECM static library files + bin -- ECM executable files + +Within these lib, dll and bin directories, the outputs are located in +sub-directories determined by the platform and configuration: + + win32\release + win32\debug + x64\release + x64\debug + +If you don't want assembler support you need to change the define: + +#define NATIVE_REDC 1 + +in config.h (in the build.vc14 subdirectory) to: + +#undef NATIVE_REDC + +Tune +==== + +If tune is compiled and run for a particular configuration it will output +suitable values for optimising GMP-ECM to the console window. To optimise +GMP-ECM these values should be put in a suitably named file which then has +to be integrated in ecm-params.h. + +Tests +===== + +The file tests.py is a python script that runs the ECM tests. It runs the +x64/release-amd (non GPU) version by default but can be edited to test other +builds. It cannot run some tests as a result of the difficulty in the +conversion of the Unix shell scripts for the tests for use on Windows.
+ + Brian Gladman, November 2017 diff -Nru gmp-ecm-7.0.4+ds/build.vc14/tests.py gmp-ecm-7.0.5+ds/build.vc14/tests.py --- gmp-ecm-7.0.4+ds/build.vc14/tests.py 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/tests.py 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,150 @@ + +from __future__ import print_function +import os +import sys +import string +import platform +from re import match +from subprocess import Popen, PIPE, STDOUT +from tempfile import * + +try: + from time import perf_counter as timer +except ImportError: + from time import clock as timer + + +x64 = True +debug = False +test_gpu_version = True +run_non_gpu_tests = True +run_gpu_tests = True + +class Timer() : + def __enter__(self): self.start = timer() + def __exit__(self, *args): print(' time {:.3f} milliseconds'.format(1000 * (timer() - self.start))) + +cpath = os.path.dirname(__file__) +config = 'x64' if x64 else 'Win32' +mode = 'Debug' if debug else 'Release' +test_dir = '..\\bin\\{:s}\\{:s}\\'.format(config, mode) + +def get_tests(filename): + print('running tests in {:s}'.format(filename)) + start, sub, tests, c_tests = True, dict(), [], [] + with open(os.path.join(cpath, filename)) as f: + lines = f.readlines() + cnt, lnth = 0, len(lines) + while cnt < lnth: + try: + line = lines[cnt].strip() + cnt += 1 + tkns = line.split() + if line.startswith('echo') and len(tkns) > 2 and tkns[2] == '|': + while cnt < lnth and 'checkcode' not in line: + while cnt < lnth and not lines[cnt]: + cnt += 1 + if cnt < lnth: + line += '|' + lines[cnt] + cnt += 1 + start = False + elif start: + sp = line.split('="') + if len(sp) == 2: + if sp[1].startswith('${1:-./ecm}'): + sub[sp[0]] = sp[1][12:-1] + else: + sub[sp[0]] = sp[1][:-1] + continue + else: + continue + line = line.replace(';', '|') + sub_tests = [] + for line_part in line.split('|'): + tkns = line_part.strip().split() + cmd = [] + for tok in tkns: + if tok.startswith('"') and tok.endswith('"'): + tok = tok[1:-1] + if tok[0] == '$' and 
tok[1:] in sub: + tok = tok.replace(tok, sub[tok[1:]]) + elif tok == './ecm': + tok = '' + cmd += [tok] + cseq = [] + if cmd and cmd[0] == 'echo': + cseq += [cmd[1]] + cmd = cmd[2:] + if len(cmd) >= 3 and cmd[-3] == 'checkcode' and cmd[-2] == '$?': + cseq += [int(cmd[-1])] + cmd = cmd[:-3] + cmd = (' '.join(cmd)).strip() + if cmd: + cseq += [cmd] + sub_tests += [cseq] + if len(sub_tests) == 3 and all(len(x) == 1 for x in sub_tests): + tests += [tuple(x[0] for x in sub_tests)] + else: + c_tests += [sub_tests] + except ValueError: + print('parsing error on line {} in text "{}"'.format(cnt, line)) + return tests, c_tests + +def run_exe(exe, args, inp) : + al = {'stdin' : PIPE, 'stdout' : PIPE, 'stderr' : STDOUT } + if sys.platform.startswith('win'): + al['creationflags'] = 0x08000000 + p = Popen([os.path.join(cpath, exe)] + args.split(' '), **al) + res = p.communicate(inp.encode())[0].decode() + ret = p.poll() + return (ret, res) + +def output_complex_tests(x): + print('these tests are too complex:') + for t in x: + print(t) + +def do_tests(tests, ctests, out=False, gpu=False): + ecm_exe = test_dir + ("ecm_gpu.exe" if gpu else "ecm.exe") + err_cnt = 0 + for ix, tt in enumerate(tests): + print(tt[1], tt[0], end='') + rv = run_exe(ecm_exe, tt[1], tt[0]) + if type(tt[2]) == int and rv[0] != tt[2]: + print(" - *** ERROR in test {:d}: {:d} {:d} ***".format(ix, rv[0], tt[2])) + err_cnt += 1 + elif type(tt[2]) == tuple and rv[0] != tt[2][0] and rv[0] != tt[2][1]: + print(" - *** ERROR in test {:d}: {:d} {:s} ***".format(ix, rv[0], tt[2])) + err_cnt += 1 + else: + print(" - passed") + if out: + op = rv[1].rsplit('\r\n') + for i in op : + print(i) + + if ctests: + output_complex_tests(ctests) + if not err_cnt: + if ctests: + print('all other tests passed') + else: + print('all tests passed') + +with Timer(): + if os.path.exists('test.pm1.save'): + os.remove('test.pm1.save') + if run_non_gpu_tests: + t, ct = get_tests("..\\test.ecm") + do_tests(t, ct) + t, ct = 
get_tests("..\\test.pm1") + do_tests(t, ct) + t, ct = get_tests("..\\test.pp1") + do_tests(t, ct) + t, ct = get_tests("..\\testlong.pp1") + do_tests(t, ct) + t, ct = get_tests("..\\testlong.pm1") + do_tests(t, ct) + if run_gpu_tests: + t, ct = get_tests("..\\test.gpuecm") + do_tests(t, ct, gpu=True) diff -Nru gmp-ecm-7.0.4+ds/build.vc14/tune/Makefile.am gmp-ecm-7.0.5+ds/build.vc14/tune/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc14/tune/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/tune/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = tune.vcxproj tune.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc14/tune/tune.vcxproj gmp-ecm-7.0.5+ds/build.vc14/tune/tune.vcxproj --- gmp-ecm-7.0.4+ds/build.vc14/tune/tune.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/tune/tune.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,161 @@ + + + + + Release + Win32 + + + Release + x64 + + + + {80E08750-5C6C-492E-BB1E-7200978AE125} + tune + Win32Proj + + + + Application + Unicode + true + v120 + + + Application + NotSet + v140 + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + + + + MaxSpeed + true + ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CONSOLE;TUNE;%(PreprocessorDefinitions) + MultiThreaded + true + + + Level3 + ProgramDatabase + + + ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);%(AdditionalDependencies) + true + Console + true + true + MachineX86 + + + + + + + + + + + + + X64 + + + MaxSpeed + true + ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_CONSOLE;TUNE;%(PreprocessorDefinitions) + MultiThreaded + true + + + Level3 + ProgramDatabase + + + 
..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);%(AdditionalDependencies) + true + Console + true + true + MachineX64 + + + + + + + _WIN64 + + + + + + + + + + + + + + + TUNE_MULREDC_THRESH#0;TUNE_SQRREDC_THRESH#0;%(PreprocessorDefinitions) + TUNE_MULREDC_THRESH#0;TUNE_SQRREDC_THRESH#0;%(PreprocessorDefinitions) + + + + + + + + + + + + + + + + + + + + + + + + + + + {cd555681-d65b-4173-a29c-b8bf06a4871b} + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/tune/tune.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc14/tune/tune.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc14/tune/tune.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/tune/tune.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,109 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {38f1a18f-40fc-4eed-a68e-e79b58327b6c} + + + + + Source Files\Assembler + + + Source Files\Assembler + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/vacopy.c gmp-ecm-7.0.5+ds/build.vc14/vacopy.c --- gmp-ecm-7.0.4+ds/build.vc14/vacopy.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/vacopy.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,8 @@ + +#include +#include + +void _vacopy(va_list *pap, va_list ap) +{ + *pap = ap; +} diff -Nru gmp-ecm-7.0.4+ds/build.vc14/vsyasm.props 
gmp-ecm-7.0.5+ds/build.vc14/vsyasm.props --- gmp-ecm-7.0.4+ds/build.vc14/vsyasm.props 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/vsyasm.props 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,27 @@ + + + + Midl + CustomBuild + + + _SelectedFiles;$(YASMDependsOn) + + + c:\program files\yasm\ + + + + False + $(IntDir) + 0 + 0 + "$(YasmPath)"vsyasm.exe -Xvc -f $(Platform) [AllOptions] [AdditionalOptions] [Inputs] + %(ObjectFile) + Assembling %(Filename)%(Extension) + false + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/vsyasm.targets gmp-ecm-7.0.5+ds/build.vc14/vsyasm.targets --- gmp-ecm-7.0.4+ds/build.vc14/vsyasm.targets 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/vsyasm.targets 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,108 @@ + + + + + + _YASM + + + + $(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml + + + + + + + + @(YASM, '|') + + + + + + + + + $(ComputeLinkInputsTargets); + ComputeYASMOutput; + + + $(ComputeLibInputsTargets); + ComputeYASMOutput; + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc14/vsyasm.xml gmp-ecm-7.0.5+ds/build.vc14/vsyasm.xml --- gmp-ecm-7.0.4+ds/build.vc14/vsyasm.xml 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc14/vsyasm.xml 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,283 @@ + + + + + + + + + + + + + General + + + + + + Symbols + + + + + + Files + + + + + + Command Line + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Execute Before + + + Specifies the targets for the build customization to run before. + + + + + + + + + + + + Execute After + + + Specifies the targets for the build customization to run after. 
+ + + + + + + + + + + + + + + + + + Additional Options + + + Additional Options + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/a_win32a_redc.asm gmp-ecm-7.0.5+ds/build.vc15/assembler/a_win32a_redc.asm --- gmp-ecm-7.0.4+ds/build.vc15/assembler/a_win32a_redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/a_win32a_redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,133 @@ +; +; Part of GMP-ECM +; +; void ecm_redc3( +; mp_limb_t *z, rdi r8 <- rcx +; const mp_limb_t *x, rsi r9 <- rdx +; size_t n, rdx r10 <- r8 +; mp_limb_t m rcx r11 <- r9 +; ) + +%macro seq 3 + mov eax, [byte esi+4*%3] + mul ebp + add [byte edi+4*%3], %2 + adc %1, eax + mov %2, edx + adc %2, 0 +%endmacro + + text + global _ecm_redc3 + +_ecm_redc3: + push ebp + push edi + push esi + push ebx + sub esp, 16 + mov ecx, [esp+44] + mov edi, [esp+36] + mov [esp], ecx + cmp ecx, 5 + jae .3 + +.1: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + xor ebx, ebx + +.2: mov eax, [esi] + add edi, 4 + mul ebp + add esi, 4 + add eax, ebx + adc edx, 0 + add [edi-4], eax + adc edx, 0 + dec ecx + mov ebx, edx + jnz .2 + mov edi, [esp+36] + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .1 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + +.3: mov edx, ecx + dec ecx + sub edx, 2 + neg ecx + shr edx, 4 + and ecx, 15 + mov [esp+8], edx + mov edx, ecx + shl edx, 4 + neg ecx + lea edx, [edx+ecx+.6] + mov [esp+44], ecx + mov [esp+12], edx + +.4: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + mov edx, [esp+8] + mov [esp+4], edx + mov eax, [esi] + lea esi, [esi+ecx*4+4] + mul ebp + lea edi, [edi+ecx*4] + mov ebx, edx + mov edx, [esp+12] + test ecx, 1 + mov ecx, eax + cmovnz ecx, ebx + cmovnz ebx, eax + jmp edx + + align 32 +.5: add edi, 64 +.6: + +%assign i 0 +%rep 16 + %if (i & 1) + seq ecx, ebx, i + %else + seq ebx, ecx, i 
+ %endif + %assign i i + 1 +%endrep + + dec dword [esp+4] + lea esi, [esi+64] + jns .5 + + add [edi+64], ecx + mov edi, [esp+36] + adc ebx, 0 + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .4 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + + end + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/a_win32p_mulredc.asm gmp-ecm-7.0.5+ds/build.vc15/assembler/a_win32p_mulredc.asm --- gmp-ecm-7.0.4+ds/build.vc15/assembler/a_win32p_mulredc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/a_win32p_mulredc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,148 @@ + +; Part of GMP-ECM +; +; mp_limb_t mulredc1( 1 limb +; mp_limb_t *z, +; const mp_limb_t x, +; const mp_limb_t y, +; const mp_limb_t m, +; mp_limb_t inv_m +; ) +; +; mp_limb_t mulredc( > 1 limb +; mp_limb_t *z, +; const mp_limb_t *x, +; const mp_limb_t *y, +; const mp_limb_t *m, +; mp_limb_t inv_m +; ) + +%macro mseq 1 + movd mm1, [esi+4*%1] + movd mm2, [edi+4*%1] + pmuludq mm1, mm7 + paddq mm2, mm1 + paddq mm0, mm2 + movd [edi+4*%1], mm0 + psrlq mm0, 32 +%endmacro + +%macro mulredc 1 +%assign limbs %1 +%define f_name(x) _mulredc %+ x + + global f_name(limbs) +%ifdef DLL + export f_name(limbs) +%endif + +f_name(limbs): + push ebp + push edi + push esi + push ebx + sub esp, 8*(limbs+1) + mov edi, esp + +%assign i 0 +%rep 2 * limbs + 1 + mov dword [edi+4*i], 0 + %assign i i + 1 +%endrep + + mov dword [esp+8*limbs+4], limbs + + align 32 + +.1: mov eax, [esp+8*limbs+32] + mov esi, [esp+8*limbs+36] + mov eax, [eax] + mul dword [esi] + add eax, [edi] + mul dword [esp+8*limbs+44] + mov ebp, eax + mov esi, [esp+8*limbs+40] + + pxor mm0, mm0 + movd mm7, ebp + +%assign i 0 +%rep limbs + mseq i + %assign i i + 1 +%endrep + + movd ecx, mm0 + + add [edi+4*limbs], ecx + adc dword [edi+4*limbs+4], 0 + mov eax, [esp+8*limbs+32] + mov ebp, [eax] + mov esi, [esp+8*limbs+36] + + pxor mm0, mm0 + movd mm7, ebp + +%assign i 0 +%rep limbs + mseq i + 
%assign i i + 1 +%endrep + + movd ecx, mm0 + add [edi+4*limbs], ecx + adc dword [edi+4*limbs+4], 0 + add dword [esp+8*limbs+32], 4 + add edi, 4 + dec dword [esp+8*limbs+4] + jnz .1 + + mov ebx, [esp+8*limbs+28] + +%assign i 0 +%rep limbs + mov eax, [edi+4*i] + mov [ebx+4*i], eax + %assign i i + 1 +%endrep + mov eax, [edi+4*limbs] + add esp, 8*(limbs+1) + + pop ebx + pop esi + pop edi + pop ebp + emms + ret +%endmacro + + bits 32 + section .text + + global _mulredc1 +%ifdef DLL + export _mulredc1 +%endif + +_mulredc1: + mov eax, [esp+12] + mul dword [esp+8] + mov [esp+12], edx + mov [esp+8], eax + mul dword [esp+20] + mul dword [esp+16] + add eax, [esp+8] + adc edx, [esp+12] + mov ecx, [esp+4] + mov [ecx], edx + adc eax, 0 + ret + +%assign i 2 +%rep 19 ; 3..20 inclusive + mulredc i + %assign i i + 1 +%endrep + + end + diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/a_win32p_redc.asm gmp-ecm-7.0.5+ds/build.vc15/assembler/a_win32p_redc.asm --- gmp-ecm-7.0.4+ds/build.vc15/assembler/a_win32p_redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/a_win32p_redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,145 @@ +; +; Part of GMP-ECM +; +; void ecm_redc3( +; mp_limb_t *z, rdi r8 <- rcx +; const mp_limb_t *x, rsi r9 <- rdx +; size_t n, rdx r10 <- r8 +; mp_limb_t m rcx r11 <- r9 +; ) + +%macro rloop 3 + mov eax, [byte esi+4*%3] + mul ebp + add [byte edi+4*%3], %2 + adc %1, eax + mov %2, edx + adc %2, 0 +%endmacro + + bits 32 + section .text + + global _ecm_redc3 +%ifdef DLL + export _ecm_redc3 +%endif + +_ecm_redc3: + push ebp + push edi + push esi + push ebx + sub esp, 16 + + mov ecx, [esp+44] + mov edi, [esp+36] + mov [esp], ecx + cmp ecx, 5 + jae .unroll + +.1: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + xor ebx, ebx + +.2: mov eax, [esi] + add edi, 4 + mul ebp + add esi, 4 + add eax, ebx + adc edx, 0 + add [edi-4], eax + adc edx, 0 + dec ecx + mov ebx, edx + jnz .2 + mov edi, 
[esp+36] + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .1 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + +.unroll: + mov edx, ecx + dec ecx + sub edx, 2 + neg ecx + shr edx, 4 + and ecx, 15 + mov [esp+8], edx + mov edx, ecx + shl edx, 4 + neg ecx + lea edx, [edx+ecx*1+.loop_base] + mov [esp+44], ecx + mov [esp+12], edx + +.4: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + mov edx, [esp+8] + mov [esp+4], edx + mov eax, [esi] + lea esi, [esi+ecx*4+4] + mul ebp + lea edi, [edi+ecx*4] + mov ebx, edx + mov edx, [esp+12] + test ecx, 1 + mov ecx, eax + cmovnz ecx, ebx + cmovnz ebx, eax + jmp edx + + align 32 +.5: add edi, 64 +.loop_base: + rloop ebx, ecx, 0 + rloop ecx, ebx, 1 + rloop ebx, ecx, 2 + rloop ecx, ebx, 3 + rloop ebx, ecx, 4 + rloop ecx, ebx, 5 + rloop ebx, ecx, 6 + rloop ecx, ebx, 7 + rloop ebx, ecx, 8 + rloop ecx, ebx, 9 + rloop ebx, ecx, 10 + rloop ecx, ebx, 11 + rloop ebx, ecx, 12 + rloop ecx, ebx, 13 + rloop ebx, ecx, 14 + rloop ecx, ebx, 15 + + dec dword [esp+4] + lea esi, [esi+64] + jns .5 + + add [edi+64], ecx + mov edi, [esp+36] + adc ebx, 0 + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .4 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + + end diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/a_x64_mulredc.asm gmp-ecm-7.0.5+ds/build.vc15/assembler/a_x64_mulredc.asm --- gmp-ecm-7.0.4+ds/build.vc15/assembler/a_x64_mulredc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/a_x64_mulredc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,237 @@ +; +; Part of GMP-ECM +; +; mp_limb_t mulredc1( MSVC 1 limb +; mp_limb_t *z, rcx +; const mp_limb_t x, rdx +; const mp_limb_t y, r8 +; const mp_limb_t m, r9 +; mp_limb_t inv_m [rsp+0x28] +; ) +; +; mp_limb_t mulredc( MSVC > 1 limb +; mp_limb_t *z, rcx +; const mp_limb_t *x, rdx +; const mp_limb_t *y, r8 +; const mp_limb_t *m, r9 +; mp_limb_t inv_m [rsp+0x28] +; ) + +%macro 
mseq_1 4 + mov %2, rcx + mul r14 + add %1, rax + mov rax, [r9+8*%3] + adc %2, rdx + mul r11 +%if %3 < %4 - 1 + add rax, %1 + mov [rbp+8*(%3-1)], rax + mov rax, [r8+8*(%3+1)] + adc %2, rdx + setc cl +%else + add %1, rax + mov [rbp+8*(%3-1)], %1 + adc %2, rdx + mov [rbp+8*%3], %2 + setc cl + mov [rbp+8*(%3+1)], rcx +%endif +%endmacro + +%macro mseq_20 2 + mov r14, [r13+r12*8] + mov rax, [r8] + mov %1, [rbp] + mov %2, [rbp+8] + mul r14 + add r12, 1 + add rax, %1 + adc %2, rdx + setc cl + mov %1, rax + imul rax, r10 + mov r11, rax + mul qword [r9] + add %1, rax + adc %2, rdx + mov rax, [r8+8] +%endmacro + +%macro mseq_2 4 + mov %2, [rbp+8*(%3+1)] + adc %2, rcx +%if %3 < %4 - 1 + setc cl +%endif + mul r14 + add %1, rax + mov rax, [r9+8*%3] + adc %2, rdx +%if %3 < %4 - 1 + adc cl, 0 +%else + setc cl +%endif + mul r11 +%if %3 < %4 - 1 + add rax, %1 + mov [rbp+8*(%3-1)], rax + adc %2, rdx + mov rax, [r8+8*(%3+1)] +%else + add %1, rax + mov [rbp+8*(%3-1)], %1 + adc %2, rdx + mov [rbp+8*%3],%2 + adc cl, 0 + mov [rbp+8*(%3+1)], rcx +%endif +%endmacro + +%macro store 1 +%assign i 0 +%rep %1 + %if i == %1 - 1 && (%1 & 1) + mov rax, [rbp+8*i] + mov [rdi+8*i], rax + %elif (i & 1) + mov [rdi+8*(i-1)], rax + mov [rdi+8*i], rdx + %else + mov rax, [rbp+8*i] + mov rdx, [rbp+8*(i+1)] + %endif + %assign i i + 1 +%endrep +%endmacro + +%macro mulredc 1 + +%assign limbs %1 +%define f_name(x) mulredc %+ x +%define stack_space 8 * (limbs + 1 + (limbs & 1)) + + global f_name(limbs) +%ifdef DLL + export f_name(limbs) +%endif + + align 64 + +PROC_FRAME f_name(limbs) ; SEH Frame + push_reg rbp + push_reg rbx + push_reg rsi + push_reg rdi + push_reg r12 + push_reg r13 + push_reg r14 + alloc_stack stack_space +END_PROLOGUE + ; *y in r8 + mov rdi, rcx ; *z -> rdi + mov r13, rdx ; *x -> r13 + mov r10, [rsp+8*12+stack_space] ; invm -> r10 + ; *m in r9 + mov r14, [r13] + mov rax, [r8] + xor rcx, rcx + lea rbp, [rsp] + mov r12, rcx + mul qword r14 + add r12, 1 + mov rsi, rax + mov rbx, rdx + imul rax, 
r10 + mov r11, rax + mul qword [r9] + add rsi, rax + mov rax, [r8+8] + adc rbx, rdx + setc cl + +%assign j 1 +%rep limbs - 1 +%if (j & 1) + mseq_1 rbx, rsi, j, limbs +%else + mseq_1 rsi, rbx, j, limbs +%endif + %assign j j + 1 +%endrep + + align 32 +.1: + +%assign j 1 +%if (limbs & 1) + mseq_20 rsi, rbx + %rep limbs - 1 + %if (j & 1) + mseq_2 rbx, rsi, j, limbs + %else + mseq_2 rsi, rbx, j, limbs + %endif + %assign j j + 1 + %endrep +%else + mseq_20 rbx, rsi + %rep limbs - 1 + %if (j & 1) + mseq_2 rsi, rbx, j, limbs + %else + mseq_2 rbx, rsi, j, limbs + %endif + %assign j j + 1 + %endrep +%endif + + cmp r12, limbs + jb .1 + + store limbs + + mov rax, rcx + add rsp, stack_space + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbx + pop rbp + ret +ENDPROC_FRAME +%endmacro + + bits 64 + section .text + + global mulredc1 +%ifdef DLL + export mulredc1 +%endif + + align 64 +mulredc1: + mov rax, r8 + mul rdx + mov r10, rax + mov r11, rdx + mul qword [rsp+0x28] + mul r9 + add rax, r10 + adc rdx, r11 + mov [rcx], rdx + adc rax, 0 + ret + +%assign i 2 +%rep 19 ; 2..20 inclusive + mulredc i + %assign i i + 1 +%endrep + + end diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/a_x64_redc.asm gmp-ecm-7.0.5+ds/build.vc15/assembler/a_x64_redc.asm --- gmp-ecm-7.0.4+ds/build.vc15/assembler/a_x64_redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/a_x64_redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,161 @@ +; +; Part of GMP-ECM +; +; void ecm_redc3( +; mp_limb_t *z, rdi r8 <- rcx +; const mp_limb_t *x, rsi r9 <- rdx +; size_t n, rdx r10 <- r8 +; mp_limb_t m rcx r11 <- r9 +; ) + +%macro rloop 3 + mov rax,[byte rsi+8*%3] + mul rbp + add [byte rdi+8*%3], %1 + adc %2, rax + mov %1, rdx + adc %1, 0 +%endmacro + + bits 64 + section .text + + global ecm_redc3 +%ifdef DLL + export ecm_redc3 +%endif + +PROC_FRAME ecm_redc3 + push_reg rbp + push_reg rbx + push_reg rsi + push_reg rdi + alloc_stack 5*8 +END_PROLOGUE + mov rdi, rcx + mov rsi, rdx + 
mov rdx, r8 + mov rcx, r9 + + mov r8, rdi + mov r9, rsi + mov r10, rdx + mov r11, rcx + + mov rcx, r10 + mov [rsp], rcx + cmp rcx, 3 + jae .unroll + +.1: mov rbp, r11 + mov rsi, r9 + imul rbp, [rdi] + mov r8, rdi + mov rcx, r10 + xor rbx, rbx + +.2: mov rax, [rsi] + add rdi, 8 + mul rbp + add rsi, 8 + add rax, rbx + adc rdx, 0 + add [rdi-8], rax + adc rdx, 0 + dec rcx + mov rbx, rdx + jnz .2 + mov rdi, r8 + mov [rdi], rbx + dec qword [rsp] + lea rdi, [rdi+8] + jnz .1 + + add rsp, 5*8 + pop rdi + pop rsi + pop rbx + pop rbp + ret + +.unroll: + mov rdx, rcx + dec rcx + sub rdx, 2 + neg rcx + shr rdx, 4 + and rcx, 15 + mov [rsp+16], rdx + mov rdx, rcx + shl rdx, 4 + lea r10, [.loop_base wrt rip] + add rdx, r10 + lea rdx, [rdx+rcx*4] + add rdx, rcx + neg rcx + mov r10, rcx + mov [rsp+24], rdx + +.4: mov rbp, r11 + mov rsi, r9 + imul rbp, [rdi] + mov r8, rdi + mov rcx, r10 + mov rdx, [rsp+16] + mov [rsp+8], rdx + + mov rax, [rsi] + lea rsi, [rsi+rcx*8+8] + mul rbp + lea rdi, [rdi+rcx*8] + mov rbx, rdx + + mov rdx, [rsp+24] + test rcx, 1 + mov rcx, rax + cmovnz rcx, rbx + cmovnz rbx, rax + jmp rdx + + align 64 + +.5: add rdi, 128 +.loop_base: + rloop rcx, rbx, 0 + rloop rbx, rcx, 1 + rloop rcx, rbx, 2 + rloop rbx, rcx, 3 + rloop rcx, rbx, 4 + rloop rbx, rcx, 5 + rloop rcx, rbx, 6 + rloop rbx, rcx, 7 + rloop rcx, rbx, 8 + rloop rbx, rcx, 9 + rloop rcx, rbx, 10 + rloop rbx, rcx, 11 + rloop rcx, rbx, 12 + rloop rbx, rcx, 13 + rloop rcx, rbx, 14 + rloop rbx, rcx, 15 + + dec qword [rsp+8] + lea rsi, [rsi+128] + jns .5 + + add [rdi+128], rcx + mov rdi, r8 + adc rbx, 0 + mov [rdi], rbx + dec qword [rsp] + lea rdi, [rdi+8] + jnz .4 + + add rsp, 5*8 + pop rdi + pop rsi + pop rbx + pop rbp + ret +ENDPROC_FRAME + + end diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/Makefile.am gmp-ecm-7.0.5+ds/build.vc15/assembler/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc15/assembler/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/Makefile.am 2022-06-06 
14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ +EXTRA_DIST = a_win32a_mulredc.asm a_win32a_redc.asm a_win32p_mulredc.asm \ + a_win32p_redc.asm a_x64_mulredc.asm a_x64_redc.asm \ + test_mulredc.c mulredc.h mulredc.asm redc.asm diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/mulredc.asm gmp-ecm-7.0.5+ds/build.vc15/assembler/mulredc.asm --- gmp-ecm-7.0.4+ds/build.vc15/assembler/mulredc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/mulredc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,8 @@ + +%ifdef _WIN64 +%include "a_x64_mulredc.asm" +%elifdef AMD_ASM +%include "a_win32a_mulredc.asm" +%else +%include "a_win32p_mulredc.asm" +%endif diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/mulredc.h gmp-ecm-7.0.5+ds/build.vc15/assembler/mulredc.h --- gmp-ecm-7.0.4+ds/build.vc15/assembler/mulredc.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/mulredc.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,32 @@ +#ifndef __ASM_REDC_H__ +#define __ASM_REDC_H__ + +#include + +extern void ecm_redc3(mp_limb_t *cp, const mp_limb_t *np, mp_size_t nn, mp_limb_t Nprim); + + +/* WARNING: the size-1 version doesn't take pointers in input */ +extern mp_limb_t mulredc1(mp_limb_t *z, mp_limb_t x, mp_limb_t y, mp_limb_t m, mp_limb_t inv_m); + +extern mp_limb_t mulredc2(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc3(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc4(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc5(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc6(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc7(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t 
inv_m); +extern mp_limb_t mulredc8(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc9(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc10(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc11(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc12(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc13(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc14(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc15(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc16(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc17(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc18(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc19(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc20(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); + +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/redc.asm gmp-ecm-7.0.5+ds/build.vc15/assembler/redc.asm --- gmp-ecm-7.0.4+ds/build.vc15/assembler/redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,7 @@ +%ifdef _WIN64 +%include "a_x64_redc.asm" +%elif AMD_ASM +%include "a_win32a_redc.asm" +%else +%include "a_win32p_redc.asm" 
+%endif diff -Nru gmp-ecm-7.0.4+ds/build.vc15/assembler/test_mulredc.c gmp-ecm-7.0.5+ds/build.vc15/assembler/test_mulredc.c --- gmp-ecm-7.0.4+ds/build.vc15/assembler/test_mulredc.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/assembler/test_mulredc.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,303 @@ +#include +#include +#include + +#include + +#include "asmredc.h" + +void mp_print(mp_limb_t *x, int N) { + int i; + for (i = 0; i < N-1; ++i) + printf("%lu + W*(", x[i]); + printf("%lu", x[N-1]); + for (i = 0; i < N-1; ++i) + printf(")"); + printf("\n"); +} + +static mp_limb_t +call_mulredc (int N, mp_limb_t *z, mp_limb_t *x, mp_limb_t *y, mp_limb_t *m, + mp_limb_t invm) +{ + mp_limb_t cy; + + switch (N) + { + case 1: + cy = mulredc1(z, x[0], y[0], m[0], invm); + break; + case 2: + cy = mulredc2(z, x, y, m, invm); + break; + case 3: + cy = mulredc3(z, x, y, m, invm); + break; + case 4: + cy = mulredc4(z, x, y, m, invm); + break; + case 5: + cy = mulredc5(z, x, y, m, invm); + break; + case 6: + cy = mulredc6(z, x, y, m, invm); + break; + case 7: + cy = mulredc7(z, x, y, m, invm); + break; + case 8: + cy = mulredc8(z, x, y, m, invm); + break; + case 9: + cy = mulredc9(z, x, y, m, invm); + break; + case 10: + cy = mulredc10(z, x, y, m, invm); + break; + case 11: + cy = mulredc11(z, x, y, m, invm); + break; + case 12: + cy = mulredc12(z, x, y, m, invm); + break; + case 13: + cy = mulredc13(z, x, y, m, invm); + break; + case 14: + cy = mulredc14(z, x, y, m, invm); + break; + case 15: + cy = mulredc15(z, x, y, m, invm); + break; + case 16: + cy = mulredc16(z, x, y, m, invm); + break; + case 17: + cy = mulredc17(z, x, y, m, invm); + break; + case 18: + cy = mulredc18(z, x, y, m, invm); + break; + case 19: + cy = mulredc19(z, x, y, m, invm); + break; + case 20: + cy = mulredc20(z, x, y, m, invm); + break; + default: + cy = mulredc20(z, x, y, m, invm); + } + return cy; +} + +void test(mp_size_t N, int k) +{ + mp_limb_t *x, *y, *yp, *z, *m, invm, cy, 
cy2, *tmp, *tmp2, *tmp3; + int i, j; + + x = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); + y = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); + z = (mp_limb_t *) malloc((N+1)*sizeof(mp_limb_t)); + m = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); + tmp = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); + tmp2 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); + tmp3 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); + + if (x == NULL || y == NULL || z == NULL || m == NULL || tmp == NULL || + tmp2 == NULL || tmp3 == NULL) + { + fprintf (stderr, "Cannot allocate memory in test_mulredc\n"); + exit (1); + } + + mpn_random2(m, N); + m[0] |= 1UL; + if (m[N-1] == 0) + m[N-1] = 1UL; + + invm = 1UL; + for (i = 0; i < 10; ++i) + invm = (2*invm-m[0]*invm*invm); + invm = -invm; + + assert( (invm*m[0] +1UL) == 0UL); + + yp = y; + for (i=0; i < k; ++i) { + /* Try a few special cases */ + if (i == 0) + { + /* Try all 0, product should be 0 */ + for (j = 0; j < N; j++) + x[j] = y[j] = 0; + } + else if (i == 1) + { + /* Try all 1 */ + for (j = 0; j < N; j++) + x[j] = y[j] = 1; + } + else if (i == 2) + { + /* Try all 2^wordsize - 1 */ + for (j = 0; j < N; j++) + x[j] = y[j] = ~(0UL); + } + else + { + /* In the other cases, try random data */ + if (i % 2 == 0) + { + /* Try squaring */ + mpn_random2(x, N); + yp = x; + } + else + { + /* Try multiplication */ + mpn_random2(x, N); + mpn_random2(y, N); + } + } + + // Mul followed by ecm_redc3 + mpn_mul_n(tmp, x, yp, N); + ecm_redc3(tmp, m, N, invm); + cy2 = mpn_add_n (tmp2, tmp + N, tmp, N); + + // Mixed mul and redc + cy = call_mulredc (N, z, x, yp, m, invm); + + if (cy != cy2) + printf ("i = %d: mulredc cy = %ld, mpn_mul_n/ecm_redc3 cy = %ld\n", + i, (long) cy, (long) cy2); + assert (cy == cy2); + if (mpn_cmp(z,tmp2, N) != 0) + { + printf ("i = %d\nmulredc = ", i); + for (j = N - 1; j >= 0; j--) + printf ("%lx ", z[j]); + printf ("\nmpn_mul_n/ecm_redc3 = "); + for (j = N - 1; j >= 0; j--) + printf ("%lx ", tmp2[j]); + printf ("\n"); + assert 
(mpn_cmp(z,tmp2, N) == 0); + } + + if (cy) + printf("!"); + z[N] = cy; + // Check with pure gmp : multiply by 2^(N*GMP_NUMB_BITS) and compare. + for (j=0; j < N; ++j) { + tmp[j] = 0; + tmp[j+N] = z[j]; + } + tmp[2*N] = z[N]; + mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N+1, m, N); + for (j=0; j < N; ++j) + z[j] = tmp3[j]; + + mpn_mul_n(tmp, x, yp, N); + mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N, m, N); + + assert(mpn_cmp(z, tmp3, N) == 0); + } + + free(tmp); free(tmp2); free(tmp3); + free(x); free(y); free(z); free(m); +} + + + +int main(int argc, char** argv) +{ + int i, len; + + if (argc > 1) /* Test a specific length */ + { + len = atoi (argv[1]); + for (i = 0; i < 1; i++) + test (len, 1000000); + return 0; + } + + for (;;) { + for (i = 1; i <= 20; ++i) { + test(i, 1000); + } +#if 0 + test(1, 1000); + test(2, 1000); + test(3, 1000); + test(4, 1000); + test(5, 1000); + test(6, 1000); + test(7, 1000); + test(8, 1000); + test(9, 1000); + test(10, 1000); + test(11, 1000); + test(12, 1000); + test(13, 100); + test(14, 100); + test(15, 100); + test(16, 100); + test(17, 100); + test(18, 100); + test(44, 10); + test(45, 10); + test(46, 10); + test(47, 10); + test(48, 10); + test(49, 10); +#endif + printf("."); fflush(stdout); + } +#if 0 + x[0] = 12580274668139321508UL; + x[1] = 9205793975152560417UL; + x[2] = 7857372727033793057UL; + + y[0] = 13688385828267279103UL; + y[1] = 10575011835742767258UL; + y[2] = 8802048318027595690UL; + + + m[0] = 2981542467342508025UL; + m[1] = 5964669706257742025UL; + m[2] = 18446744073678090270UL; + + invm = 9419286575570128311UL; + + carry = mulredc(z, x, y, m, 3, invm); + + printf("%lu + 2^64*(%lu + 2^64*%lu), carry=%lu\n", z[0], z[1], z[2], carry); +#endif + return 0; +} + + +#if 0 + +W := 2^64; + +x0:= 12580274668139321508; +x1:= 9205793975152560417; +x2:= 7857372727033793057; +x := x0 + W*(x1 + W*x2); + +y0:= 13688385828267279103; +y1:= 10575011835742767258; +y2:= 8802048318027595690; +y := y0 + W*(y1 + W*y2); + +m0:= 2981542467342508025; +m1:= 
5964669706257742025; +m2:= 18446744073678090270; +m := m0 + W*(m1 + W*m2); + +invm := 9419286575570128311; + + + +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vc15/bench_mulredc/bench_mulredc.vcxproj gmp-ecm-7.0.5+ds/build.vc15/bench_mulredc/bench_mulredc.vcxproj --- gmp-ecm-7.0.4+ds/build.vc15/bench_mulredc/bench_mulredc.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/bench_mulredc/bench_mulredc.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,171 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {4727DE12-787D-432D-B166-BF103B0C3C87} + Win32Proj + bench_mulredc + 10.0.16299.0 + + + + Application + true + v141 + + + Application + true + v141 + + + Application + false + true + v141 + + + Application + false + true + v141 + + + + + + + + + + + + + + + + + + + + + + + true + $(SolutionDir)..bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + true + $(SolutionDir)..bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\ + MultiThreadedDebug + + + Console + true + psapi.lib;..\..\..\$(mp_dir)$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + + + Level3 + Disabled + _WIN64;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\ + MultiThreadedDebug + + + Console + true + psapi.lib;..\..\..\$(mp_dir)$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\ + MultiThreaded + + + Console + true + true 
+ true + psapi.lib;..\..\..\$(mp_dir)$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + _WIN64;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\ + MultiThreaded + + + Console + true + true + true + psapi.lib;..\..\..\$(mp_dir)$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/bench_mulredc/bench_mulredc.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc15/bench_mulredc/bench_mulredc.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc15/bench_mulredc/bench_mulredc.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/bench_mulredc/bench_mulredc.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,23 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/bench_mulredc/Makefile.am gmp-ecm-7.0.5+ds/build.vc15/bench_mulredc/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc15/bench_mulredc/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/bench_mulredc/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = bench_mulredc.vcxproj bench_mulredc.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc15/config.h gmp-ecm-7.0.5+ds/build.vc15/config.h --- gmp-ecm-7.0.4+ds/build.vc15/config.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/config.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,246 @@ +/* config.h.in. Generated from configure.in by autoheader. 
*/ + +#define VERSION ECM_VERSION + +#define VERSION_GPU "gpu_ecm-win" + +#define PACKAGE_BUGREPORT "ecm-discuss@inria.fr" + +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. This function is required for `alloca.c' support on those systems. + */ +#undef CRAY_STACKSEG_END + +/* Define to 1 if using `alloca.c'. */ +#define C_ALLOCA 1 + +/* Define to 1 if you have the `access' function. */ +#undef HAVE_ACCESS + +/* Define to 1 if you have `alloca', as a function or macro. */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). + */ +#undef HAVE_ALLOCA_H + +/* Define to 1 if you have the `ctime' function. */ +#define HAVE_CTIME 1 + +/* Define to 1 if you have the <ctype.h> header file. */ +#define HAVE_CTYPE_H 1 + +/* Define to 1 if you have the `floor' function. */ +#define HAVE_FLOOR 1 + +/* Define to 1 if you have the `fmod' function. */ +#define HAVE_FMOD 1 + +/* Define to 1 if you have the `gethostname' function. */ +#define HAVE_GETHOSTNAME 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `gettimeofday' function. */ +#undef HAVE_GETTIMEOFDAY + +/* Define to 1 if you have the <gmp.h> header file. */ +#define HAVE_GMP_H 1 + +/* Define to 1 if gwnum.a or gwnum.lib exist */ +#undef HAVE_GWNUM + +/* Define to 1 if you have the <inttypes.h> header file. */ +#undef HAVE_INTTYPES_H + +/* Define to 1 if you have the <io.h> header file. */ +#undef HAVE_IO_H + +/* Define to 1 if you have the `isascii' function. */ +#undef HAVE_ISASCII + +/* Define to 1 if you have the `isdigit' function. */ +#define HAVE_ISDIGIT 1 + +/* Define to 1 if you have the `isspace' function. */ +#define HAVE_ISSPACE 1 + +/* Define to 1 if you have the `isxdigit' function. */ +#define HAVE_ISXDIGIT 1 + +/* Define to 1 if you have the `m' library (-lm). */ +#undef HAVE_LIBM + +/* Define to 1 if you have the <limits.h> header file.
*/ +#define HAVE_LIMITS_H 1 + +/* Define to 1 if you have the <malloc.h> header file. */ +#define HAVE_MALLOC_H 1 + +/* Define to 1 if you have the `malloc_usable_size' function. */ +#undef HAVE_MALLOC_USABLE_SIZE + +/* Define to 1 if you have the <math.h> header file. */ +#define HAVE_MATH_H 1 + +/* Define to 1 if you have the `memmove' function. */ +#define HAVE_MEMMOVE 1 + +/* Define to 1 if you have the <memory.h> header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `nice' function. */ +#undef HAVE_NICE + +/* Define to 1 if you have the `pow' function. */ +#define HAVE_POW 1 + +/* Define to 1 if you have the `signal' function. */ +#define HAVE_SIGNAL 1 + +/* Define to 1 if you have the <signal.h> header file. */ +#define HAVE_SIGNAL_H 1 + +/* Define to 1 if you have the `sqrt' function. */ +#define HAVE_SQRT 1 + +/* Define to 1 if you have the <stdint.h> header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the <stdlib.h> header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the <strings.h> header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the <string.h> header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strlen' function. */ +#define HAVE_STRLEN 1 + +/* Define to 1 if you have the `strncasecmp' function. */ +#undef HAVE_STRNCASECMP + +/* Define to 1 if you have the `strstr' function. */ +#undef HAVE_STRSTR + +/* Define to 1 if you have the <sys/resource.h> header file. */ +#undef HAVE_SYS_RESOURCE_H + +/* Define to 1 if you have the <sys/stat.h> header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the <sys/time.h> header file. */ +#undef HAVE_SYS_TIME_H + +/* Define to 1 if you have the <sys/types.h> header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `time' function. */ +#undef HAVE_TIME + +/* Define to 1 if you have the <unistd.h> header file.
*/ +#undef HAVE_UNISTD_H + +/* Define to 1 if you have the `unlink' function. */ +#define HAVE_UNLINK 1 + +/* Define to 1 if you have the <windows.h> header file. */ +#define HAVE_WINDOWS_H 1 + +/* Define to 1 if you have the `__gmpn_add_nc' function. */ +#if defined( _WIN64 ) +# define HAVE___GMPN_ADD_NC 1 +#endif + +/* Define to 1 if you have the `__gmpn_mod_34lsub1' function. */ +#define HAVE___GMPN_MOD_34LSUB1 1 + +/* Define to 1 if you have the `__gmpn_mul_fft' function. */ +#define HAVE___GMPN_MUL_FFT 1 + +/* Define to 1 if you want memory debugging */ +#undef MEMORY_DEBUG + +/* Define if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 +#define HAVE_LONG_LONG_INT 1 + +/* Define to 1 to use asm redc on x86 or x86_64 */ +# define NATIVE_REDC 1 + +/* Define to 1 if your C compiler doesn't accept -c and -o together. */ +#undef NO_MINUS_C_MINUS_O + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at runtime. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +#undef STACK_DIRECTION + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ +#undef TIME_WITH_SYS_TIME + +/* Define to 1 if you want assertions enabled */ +#undef WANT_ASSERT + +/* Define to 1 if you want shell command execution */ +#undef WANT_SHELLCMD + +/* Define to empty if `const' does not conform to ANSI C. */ +#undef const + +/* How to specify hot-spot attribute, if available */ +#define ATTRIBUTE_HOT + +#define HAVE___GMPN_REDC_1 1 + +#define HAVE___GMPN_REDC_2 1 + +#define HAVE_ASM_REDC3 1 + +#define WINDOWS64_ABI 1 + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name.
*/ +#ifndef __cplusplus +#define inline __inline +#endif + +/* Define to `unsigned int' if does not define. */ +#undef size_t + +#define PRIdSIZE "Id" +#define PRIuSIZE "Iu" + +#ifdef _MSC_VER + +#define __func__ __FUNCTION__ + +/* define Windows tuning here */ +# define __tune_corei7__ + +# if _MSC_VER < 1600 +# define int64_t __int64 +# define uint64_t unsigned __int64 +# endif +# define strncasecmp strnicmp +# define access _access +# define alloca _alloca +# define fseek64 _fseek64 +# define ftell64 _ftell64 +# define omp_get_thread_limit omp_get_max_threads +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vc15/ecm/ecm.vcxproj gmp-ecm-7.0.5+ds/build.vc15/ecm/ecm.vcxproj --- gmp-ecm-7.0.4+ds/build.vc15/ecm/ecm.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/ecm/ecm.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,240 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} + ecm + Win32Proj + 10.0.16299.0 + + + + Application + v141 + + + Application + v141 + + + Application + v141 + + + Application + v141 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + + Full + true + Speed + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + false + Console + true + true + false + + + MachineX86 + + + + + X64 + + + Full + true + Speed + 
..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + + + Level3 + ProgramDatabase + Default + true + + + ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) + Console + true + true + false + + + MachineX64 + 8388608 + 65536 + + + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX86 + + + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX64 + 8388608 + 65536 + + + + + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + {cd555681-d65b-4173-a29c-b8bf06a4871b} + false + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/ecm/ecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc15/ecm/ecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc15/ecm/ecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/ecm/ecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,74 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + Source Files + + + Source 
Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/ecm/Makefile.am gmp-ecm-7.0.5+ds/build.vc15/ecm/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc15/ecm/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/ecm/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = ecm.vcxproj ecm.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu/ecm_gpu.vcxproj gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu/ecm_gpu.vcxproj --- gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu/ecm_gpu.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu/ecm_gpu.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,280 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {1B353D8B-9808-4EB3-A5E7-075D751757AD} + ecm_gpu + Win32Proj + 10.0.17134.0 + + + + Application + v141 + + + Application + v141 + + + Application + v141 + + + Application + v141 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + + Full + true + Speed + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + + + Level3 + ProgramDatabase + Default + true + + + 
..\..\..\$(mp_dir)$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\lib\$(Platform)\cudart.lib + false + Console + true + true + false + + + NotSet + + + compute_50,sm_50 + + + 32 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + X64 + + + Full + true + Speed + ..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + + + Level3 + ProgramDatabase + Default + true + + + ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) + Console + true + true + false + + + NotSet + 8388608 + 65536 + + + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + + + Level3 + EditAndContinue + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\lib\$(Platform)\cudart.lib + true + Console + false + + + NotSet + + + compute_50,sm_50 + + + 32 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\lib\$(Platform)\cudart.lib + true + Console + false + + + NotSet + 8388608 + 65536 + + + compute_50,sm_50 + + + 64 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + + + 
..\..\..\$(mp_dir)$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\lib\$(Platform)\cudart.lib + true + + + compute_50,sm_50 + + + 64 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + true + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu/ecm_gpu.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu/ecm_gpu.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu/ecm_gpu.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu/ecm_gpu.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,78 @@ + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + {2a13feaf-0c0e-469a-8047-82c647322da9} + + + {163547c7-89d7-4ddc-b0ad-02b4cfd722b4} + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu/ecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu/ecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu/ecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu/ecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,68 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files 
+ + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu/Makefile.am gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = ecm_gpu.vcxproj ecm_gpu.vcxproj.filters libecm_gpu.vcxproj libecm_gpu.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu.sln gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu.sln --- gmp-ecm-7.0.4+ds/build.vc15/ecm_gpu.sln 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/ecm_gpu.sln 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,39 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libecm_gpu", "libecm_gpu\libecm_gpu.vcxproj", "{3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ecm_gpu", "ecm_gpu\ecm_gpu.vcxproj", "{1B353D8B-9808-4EB3-A5E7-075D751757AD}" + ProjectSection(ProjectDependencies) = postProject + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00} = {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00} + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|Win32.ActiveCfg = Debug|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|Win32.Build.0 = Debug|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|x64.ActiveCfg = Debug|x64 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|x64.Build.0 = Debug|x64 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|Win32.ActiveCfg = Release|Win32 + 
{3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|Win32.Build.0 = Release|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|x64.ActiveCfg = Release|x64 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|x64.Build.0 = Release|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|Win32.ActiveCfg = Debug|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|Win32.Build.0 = Debug|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|x64.ActiveCfg = Debug|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|x64.Build.0 = Debug|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|Win32.ActiveCfg = Release|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|Win32.Build.0 = Release|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|x64.ActiveCfg = Release|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff -Nru gmp-ecm-7.0.4+ds/build.vc15/ecm.sln gmp-ecm-7.0.5+ds/build.vc15/ecm.sln --- gmp-ecm-7.0.4+ds/build.vc15/ecm.sln 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/ecm.sln 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,70 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.24720.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libecm", "libecm\libecm.vcxproj", "{CD555681-D65B-4173-A29C-B8BF06A4871B}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ecm", "ecm\ecm.vcxproj", "{C0E2EA85-996A-4B5F-AD30-590FAF5B7187}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tune", "tune\tune.vcxproj", "{80E08750-5C6C-492E-BB1E-7200978AE125}" + ProjectSection(ProjectDependencies) = postProject + {CD555681-D65B-4173-A29C-B8BF06A4871B} = {CD555681-D65B-4173-A29C-B8BF06A4871B} + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} = {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} + 
EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bench_mulredc", "bench_mulredc\bench_mulredc.vcxproj", "{4727DE12-787D-432D-B166-BF103B0C3C87}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "multiecm", "multiecm\multiecm.vcxproj", "{16434DC2-371C-451B-A336-820499B98B8C}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|Win32.ActiveCfg = Debug|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|Win32.Build.0 = Debug|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|x64.ActiveCfg = Debug|x64 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|x64.Build.0 = Debug|x64 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|Win32.ActiveCfg = Release|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|Win32.Build.0 = Release|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|x64.ActiveCfg = Release|x64 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|x64.Build.0 = Release|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|Win32.ActiveCfg = Debug|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|Win32.Build.0 = Debug|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|x64.ActiveCfg = Debug|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|x64.Build.0 = Debug|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|Win32.ActiveCfg = Release|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|Win32.Build.0 = Release|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|x64.ActiveCfg = Release|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|x64.Build.0 = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|Win32.ActiveCfg = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|x64.ActiveCfg = Release|x64 + 
{80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|x64.Build.0 = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|Win32.ActiveCfg = Release|Win32 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|Win32.Build.0 = Release|Win32 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|x64.ActiveCfg = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|x64.Build.0 = Release|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|Win32.ActiveCfg = Debug|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|Win32.Build.0 = Debug|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|x64.ActiveCfg = Debug|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|x64.Build.0 = Debug|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|Win32.ActiveCfg = Release|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|Win32.Build.0 = Release|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|x64.ActiveCfg = Release|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|x64.Build.0 = Release|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|Win32.ActiveCfg = Debug|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|Win32.Build.0 = Debug|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|x64.ActiveCfg = Debug|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|x64.Build.0 = Debug|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|Win32.ActiveCfg = Release|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|Win32.Build.0 = Release|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|x64.ActiveCfg = Release|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff -Nru gmp-ecm-7.0.4+ds/build.vc15/file_copy.bat gmp-ecm-7.0.5+ds/build.vc15/file_copy.bat --- gmp-ecm-7.0.4+ds/build.vc15/file_copy.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/file_copy.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,4 @@ 
+if not exist %1 ( echo file_copy failure: %1 not found && goto exit ) +if exist %2 ( fc %1 %2 > nul && if not %errorlevel 1 goto exit ) +echo copying %1 to %2 && copy %1 %2 +:exit diff -Nru gmp-ecm-7.0.4+ds/build.vc15/gen_ecm_h.bat gmp-ecm-7.0.5+ds/build.vc15/gen_ecm_h.bat --- gmp-ecm-7.0.4+ds/build.vc15/gen_ecm_h.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/gen_ecm_h.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,13 @@ +@echo off +echo creating ecm.h from ecm.h.in +echo /* generated from ecm-h.in by gen_ecm_h.bat */>tmp.h + +for /f "tokens=1,2*" %%a in (..\ecm.h.in) do ( + if "%%a" EQU "#undef" ( + if "%%b" EQU "ECM_VERSION" ( + echo #define ECM_VERSION "7.0.5">>tmp.h + ) + ) else echo %%a %%b %%c>>tmp.h +) + +call out_copy_rename tmp.h ..\ ecm.h diff -Nru gmp-ecm-7.0.4+ds/build.vc15/getopt.c gmp-ecm-7.0.5+ds/build.vc15/getopt.c --- gmp-ecm-7.0.4+ds/build.vc15/getopt.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/getopt.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,1281 @@ +/* Getopt for GNU. + NOTE: getopt is now part of the C library, so if you don't know what + "Keep this file name-space clean" means, talk to drepper@gnu.org + before changing it! + Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001,2002 + Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* This tells Alpha OSF/1 not to define a getopt prototype in . + Ditto for AIX 3.2 and . */ + +#define HAVE_STRING_H 1 + +#ifndef _NO_PROTO +# define _NO_PROTO +#endif + +#ifdef HAVE_CONFIG_H +# include +#endif + +#if !defined __STDC__ || !__STDC__ +/* This is a separate conditional since some stdc systems + reject `defined (const)'. */ +# ifndef const +# define const +# endif +#endif + +#include + +/* Comment out all this code if we are using the GNU C Library, and are not + actually compiling the library itself. This code is part of the GNU C + Library, but also included in many other GNU distributions. Compiling + and linking in this code is a waste when using the GNU C library + (especially if it is a shared library). Rather than having every GNU + program understand `configure --with-gnu-libc' and omit the object files, + it is simpler to just do this in the source for each such file. */ + +#define GETOPT_INTERFACE_VERSION 2 +#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 +# include +# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION +# define ELIDE_CODE +# endif +#endif + +#ifndef ELIDE_CODE + + +/* This needs to come after some library #include + to get __GNU_LIBRARY__ defined. */ +#ifdef __GNU_LIBRARY__ +/* Don't include stdlib.h for non-GNU C libraries because some of them + contain conflicting prototypes for getopt. */ +# include +# include +#endif /* GNU C library. */ + +#ifdef VMS +# include +# if HAVE_STRING_H - 0 +# include +# endif +#endif + +#ifndef _ +/* This is for other GNU distributions with internationalized messages. 
*/ +# if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC +# include +# ifndef _ +# define _(msgid) gettext (msgid) +# endif +# else +# define _(msgid) (msgid) +# endif +# if defined _LIBC && defined USE_IN_LIBIO +# include +# endif +#endif + +#ifndef attribute_hidden +# define attribute_hidden +#endif + +/* This version of `getopt' appears to the caller like standard Unix `getopt' + but it behaves differently for the user, since it allows the user + to intersperse the options with the other arguments. + + As `getopt' works, it permutes the elements of ARGV so that, + when it is done, all the options precede everything else. Thus + all application programs are extended to handle flexible argument order. + + Setting the environment variable POSIXLY_CORRECT disables permutation. + Then the behavior is completely standard. + + GNU application programs can use a third alternative mode in which + they can distinguish the relative order of options and other arguments. */ + +#include "getopt.h" + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +/* 1003.2 says this must be 1 before any call. */ +int optind = 1; + +/* Formerly, initialization of getopt depended on optind==0, which + causes problems with re-calling getopt as programs generally don't + know that. 
*/ + +int __getopt_initialized attribute_hidden; + +/* The next char to be scanned in the option-element + in which the last option character we returned was found. + This allows us to pick up the scan where we left off. + + If this is zero, or a null string, it means resume the scan + by advancing to the next ARGV-element. */ + +static char *nextchar; + +/* Callers store zero here to inhibit the error message + for unrecognized options. */ + +int opterr = 1; + +/* Set to an option character which was unrecognized. + This must be initialized on some systems to avoid linking in the + system's own getopt implementation. */ + +int optopt = '?'; + +/* Describe how to deal with options that follow non-option ARGV-elements. + + If the caller did not specify anything, + the default is REQUIRE_ORDER if the environment variable + POSIXLY_CORRECT is defined, PERMUTE otherwise. + + REQUIRE_ORDER means don't recognize them as options; + stop option processing when the first non-option is seen. + This is what Unix does. + This mode of operation is selected by either setting the environment + variable POSIXLY_CORRECT, or using `+' as the first character + of the list of option characters. + + PERMUTE is the default. We permute the contents of ARGV as we scan, + so that eventually all the non-options are at the end. This allows options + to be given in any order, even with programs that were not written to + expect this. + + RETURN_IN_ORDER is an option available to programs that were written + to expect options and other ARGV-elements in any order and that care about + the ordering of the two. We describe each non-option ARGV-element + as if it were the argument of an option with character code 1. + Using `-' as the first character of the list of option characters + selects this mode of operation. + + The special argument `--' forces an end of option-scanning regardless + of the value of `ordering'. 
In the case of RETURN_IN_ORDER, only + `--' can cause `getopt' to return -1 with `optind' != ARGC. */ + +static enum +{ + REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER +} ordering; + +/* Value of POSIXLY_CORRECT environment variable. */ +static char *posixly_correct; + +#ifdef __GNU_LIBRARY__ +/* We want to avoid inclusion of string.h with non-GNU libraries + because there are many ways it can cause trouble. + On some systems, it contains special magic macros that don't work + in GCC. */ +# include +# define my_index strchr +#else + +# if HAVE_STRING_H +# include +# else +# include +# endif + +/* Avoid depending on library functions or files + whose names are inconsistent. */ + +#ifndef getenv +extern char *getenv (); +#endif + +static char * +my_index (str, chr) + const char *str; + int chr; +{ + while (*str) + { + if (*str == chr) + return (char *) str; + str++; + } + return 0; +} + +/* If using GCC, we can safely declare strlen this way. + If not using GCC, it is ok not to declare it. */ +#ifdef __GNUC__ +/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. + That was relevant to code that was here before. */ +# if (!defined __STDC__ || !__STDC__) && !defined strlen +/* gcc with -traditional declares the built-in strlen to return int, + and has done so at least since version 2.4.5. -- rms. */ +extern int strlen (const char *); +# endif /* not __STDC__ */ +#endif /* __GNUC__ */ + +#endif /* not __GNU_LIBRARY__ */ + +/* Handle permutation of arguments. */ + +/* Describe the part of ARGV that contains non-options that have + been skipped. `first_nonopt' is the index in ARGV of the first of them; + `last_nonopt' is the index after the last of them. */ + +static int first_nonopt; +static int last_nonopt; + +#ifdef _LIBC +/* Stored original parameters. + XXX This is no good solution. We should rather copy the args so + that we can compare them later. But we must not use malloc(3). 
*/ +extern int __libc_argc; +extern char **__libc_argv; + +/* Bash 2.0 gives us an environment variable containing flags + indicating ARGV elements that should not be considered arguments. */ + +# ifdef USE_NONOPTION_FLAGS +/* Defined in getopt_init.c */ +extern char *__getopt_nonoption_flags; + +static int nonoption_flags_max_len; +static int nonoption_flags_len; +# endif + +# ifdef USE_NONOPTION_FLAGS +# define SWAP_FLAGS(ch1, ch2) \ + if (nonoption_flags_len > 0) \ + { \ + char __tmp = __getopt_nonoption_flags[ch1]; \ + __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ + __getopt_nonoption_flags[ch2] = __tmp; \ + } +# else +# define SWAP_FLAGS(ch1, ch2) +# endif +#else /* !_LIBC */ +# define SWAP_FLAGS(ch1, ch2) +#endif /* _LIBC */ + +/* Exchange two adjacent subsequences of ARGV. + One subsequence is elements [first_nonopt,last_nonopt) + which contains all the non-options that have been skipped so far. + The other is elements [last_nonopt,optind), which contains all + the options processed since those non-options were skipped. + + `first_nonopt' and `last_nonopt' are relocated so that they describe + the new indices of the non-options in ARGV after they are moved. */ + +#if defined __STDC__ && __STDC__ +static void exchange (char **); +#endif + +static void +exchange (argv) + char **argv; +{ + int bottom = first_nonopt; + int middle = last_nonopt; + int top = optind; + char *tem; + + /* Exchange the shorter segment with the far end of the longer segment. + That puts the shorter segment into the right place. + It leaves the longer segment in the right place overall, + but it consists of two parts that need to be swapped next. */ + +#if defined _LIBC && defined USE_NONOPTION_FLAGS + /* First make sure the handling of the `__getopt_nonoption_flags' + string can work normally. Our top argument must be in the range + of the string. */ + if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) + { + /* We must extend the array. 
The user plays games with us and + presents new arguments. */ + char *new_str = malloc (top + 1); + if (new_str == NULL) + nonoption_flags_len = nonoption_flags_max_len = 0; + else + { + memset (__mempcpy (new_str, __getopt_nonoption_flags, + nonoption_flags_max_len), + '\0', top + 1 - nonoption_flags_max_len); + nonoption_flags_max_len = top + 1; + __getopt_nonoption_flags = new_str; + } + } +#endif + + while (top > middle && middle > bottom) + { + if (top - middle > middle - bottom) + { + /* Bottom segment is the short one. */ + int len = middle - bottom; + register int i; + + /* Swap it with the top part of the top segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[top - (middle - bottom) + i]; + argv[top - (middle - bottom) + i] = tem; + SWAP_FLAGS (bottom + i, top - (middle - bottom) + i); + } + /* Exclude the moved bottom segment from further swapping. */ + top -= len; + } + else + { + /* Top segment is the short one. */ + int len = top - middle; + register int i; + + /* Swap it with the bottom part of the bottom segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[middle + i]; + argv[middle + i] = tem; + SWAP_FLAGS (bottom + i, middle + i); + } + /* Exclude the moved top segment from further swapping. */ + bottom += len; + } + } + + /* Update records for the slots the non-options now occupy. */ + + first_nonopt += (optind - last_nonopt); + last_nonopt = optind; +} + +/* Initialize the internal data when the first call is made. */ + +#if defined __STDC__ && __STDC__ +static const char *_getopt_initialize (int, char *const *, const char *); +#endif +static const char * +_getopt_initialize (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + /* Start processing options with ARGV-element 1 (since ARGV-element 0 + is the program name); the sequence of previously skipped + non-option ARGV-elements is empty. 
*/ + + first_nonopt = last_nonopt = optind; + + nextchar = NULL; + + posixly_correct = getenv ("POSIXLY_CORRECT"); + + /* Determine how to handle the ordering of options and nonoptions. */ + + if (optstring[0] == '-') + { + ordering = RETURN_IN_ORDER; + ++optstring; + } + else if (optstring[0] == '+') + { + ordering = REQUIRE_ORDER; + ++optstring; + } + else if (posixly_correct != NULL) + ordering = REQUIRE_ORDER; + else + ordering = PERMUTE; + +#if defined _LIBC && defined USE_NONOPTION_FLAGS + if (posixly_correct == NULL + && argc == __libc_argc && argv == __libc_argv) + { + if (nonoption_flags_max_len == 0) + { + if (__getopt_nonoption_flags == NULL + || __getopt_nonoption_flags[0] == '\0') + nonoption_flags_max_len = -1; + else + { + const char *orig_str = __getopt_nonoption_flags; + int len = nonoption_flags_max_len = strlen (orig_str); + if (nonoption_flags_max_len < argc) + nonoption_flags_max_len = argc; + __getopt_nonoption_flags = + (char *) malloc (nonoption_flags_max_len); + if (__getopt_nonoption_flags == NULL) + nonoption_flags_max_len = -1; + else + memset (__mempcpy (__getopt_nonoption_flags, orig_str, len), + '\0', nonoption_flags_max_len - len); + } + } + nonoption_flags_len = nonoption_flags_max_len; + } + else + nonoption_flags_len = 0; +#endif + + return optstring; +} + +/* Scan elements of ARGV (whose length is ARGC) for option characters + given in OPTSTRING. + + If an element of ARGV starts with '-', and is not exactly "-" or "--", + then it is an option element. The characters of this element + (aside from the initial '-') are option characters. If `getopt' + is called repeatedly, it returns successively each of the option characters + from each of the option elements. + + If `getopt' finds another option character, it returns that character, + updating `optind' and `nextchar' so that the next call to `getopt' can + resume the scan with the following option character or ARGV-element. 
+ + If there are no more option characters, `getopt' returns -1. + Then `optind' is the index in ARGV of the first ARGV-element + that is not an option. (The ARGV-elements have been permuted + so that those that are not options now come last.) + + OPTSTRING is a string containing the legitimate option characters. + If an option character is seen that is not listed in OPTSTRING, + return '?' after printing an error message. If you set `opterr' to + zero, the error message is suppressed but we still return '?'. + + If a char in OPTSTRING is followed by a colon, that means it wants an arg, + so the following text in the same ARGV-element, or the text of the following + ARGV-element, is returned in `optarg'. Two colons mean an option that + wants an optional arg; if there is text in the current ARGV-element, + it is returned in `optarg', otherwise `optarg' is set to zero. + + If OPTSTRING starts with `-' or `+', it requests different methods of + handling the non-option ARGV-elements. + See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. + + Long-named options begin with `--' instead of `-'. + Their names may be abbreviated as long as the abbreviation is unique + or is an exact match for some defined option. If they have an + argument, it follows the option name in the same ARGV-element, separated + from the option name by a `=', or else in the next ARGV-element. + When `getopt' finds a long-named option, it returns 0 if that option's + `flag' field is nonzero, the value of the option's `val' field + if the `flag' field is zero. + + The elements of ARGV aren't really const, because we permute them. + But we pretend they're const in the prototype to be compatible + with other systems. + + LONGOPTS is a vector of `struct option' terminated by an + element containing a name which is zero. + + LONGIND returns the index in LONGOPTS of the long-named option found. + It is only valid when a long-named option has been found by the most + recent call.
+ + If LONG_ONLY is nonzero, '-' as well as '--' can introduce + long-named options. */ + +int +_getopt_internal (argc, argv, optstring, longopts, longind, long_only) + int argc; + char *const *argv; + const char *optstring; + const struct option *longopts; + int *longind; + int long_only; +{ + int print_errors = opterr; + if (optstring[0] == ':') + print_errors = 0; + + if (argc < 1) + return -1; + + optarg = NULL; + + if (optind == 0 || !__getopt_initialized) + { + if (optind == 0) + optind = 1; /* Don't scan ARGV[0], the program name. */ + optstring = _getopt_initialize (argc, argv, optstring); + __getopt_initialized = 1; + } + + /* Test whether ARGV[optind] points to a non-option argument. + Either it does not have option syntax, or there is an environment flag + from the shell indicating it is not an option. The later information + is only used when the used in the GNU libc. */ +#if defined _LIBC && defined USE_NONOPTION_FLAGS +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ + || (optind < nonoption_flags_len \ + && __getopt_nonoption_flags[optind] == '1')) +#else +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') +#endif + + if (nextchar == NULL || *nextchar == '\0') + { + /* Advance to the next ARGV-element. */ + + /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been + moved back by the user (who may also have changed the arguments). */ + if (last_nonopt > optind) + last_nonopt = optind; + if (first_nonopt > optind) + first_nonopt = optind; + + if (ordering == PERMUTE) + { + /* If we have just processed some options following some non-options, + exchange them so that the options come first. */ + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (last_nonopt != optind) + first_nonopt = optind; + + /* Skip any additional non-options + and extend the range of non-options previously skipped. 
*/ + + while (optind < argc && NONOPTION_P) + optind++; + last_nonopt = optind; + } + + /* The special ARGV-element `--' means premature end of options. + Skip it like a null option, + then exchange with previous non-options as if it were an option, + then skip everything else like a non-option. */ + + if (optind != argc && !strcmp (argv[optind], "--")) + { + optind++; + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (first_nonopt == last_nonopt) + first_nonopt = optind; + last_nonopt = argc; + + optind = argc; + } + + /* If we have done all the ARGV-elements, stop the scan + and back over any non-options that we skipped and permuted. */ + + if (optind == argc) + { + /* Set the next-arg-index to point at the non-options + that we previously skipped, so the caller will digest them. */ + if (first_nonopt != last_nonopt) + optind = first_nonopt; + return -1; + } + + /* If we have come to a non-option and did not permute it, + either stop the scan or describe it to the caller and pass it by. */ + + if (NONOPTION_P) + { + if (ordering == REQUIRE_ORDER) + return -1; + optarg = argv[optind++]; + return 1; + } + + /* We have found another option-ARGV-element. + Skip the initial punctuation. */ + + nextchar = (argv[optind] + 1 + + (longopts != NULL && argv[optind][1] == '-')); + } + + /* Decode the current option-ARGV-element. */ + + /* Check whether the ARGV-element is a long option. + + If long_only and the ARGV-element has the form "-f", where f is + a valid short option, don't consider it an abbreviated form of + a long option that starts with f. Otherwise there would be no + way to give the -f short option. + + On the other hand, if there's a long option "fubar" and + the ARGV-element is "-fu", do consider that an abbreviation of + the long option, just like "--fu", and not "-f" with arg "u". + + This distinction seems to be the most useful approach. 
*/ + + if (longopts != NULL + && (argv[optind][1] == '-' + || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1]))))) + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = -1; + int option_index; + + for (nameend = nextchar; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) + == (unsigned int) strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else if (long_only + || pfound->has_arg != p->has_arg + || pfound->flag != p->flag + || pfound->val != p->val) + /* Second or later nonexact match found. */ + ambig = 1; + } + + if (ambig && !exact) + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]) >= 0) + { + + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]); +#endif + } + nextchar += strlen (nextchar); + optind++; + optopt = 0; + return '?'; + } + + if (pfound != NULL) + { + option_index = indfound; + optind++; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. 
*/ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + int n; +#endif + + if (argv[optind - 1][1] == '-') + { + /* --option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("\ +%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); +#else + fprintf (stderr, _("\ +%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); +#endif + } + else + { + /* +option or -option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("\ +%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], + pfound->name); +#else + fprintf (stderr, _("\ +%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], pfound->name); +#endif + } + +#if defined _LIBC && defined USE_IN_LIBIO + if (n >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#endif + } + + nextchar += strlen (nextchar); + + optopt = pfound->val; + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); +#endif + } + nextchar += strlen (nextchar); + optopt = pfound->val; + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + + /* Can't find it as a long option. 
If this is not getopt_long_only, + or the option starts with '--' or is not a valid short + option, then it's an error. + Otherwise interpret it as a short option. */ + if (!long_only || argv[optind][1] == '-' + || my_index (optstring, *nextchar) == NULL) + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + int n; +#endif + + if (argv[optind][1] == '-') + { + /* --option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); +#else + fprintf (stderr, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); +#endif + } + else + { + /* +option or -option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); +#else + fprintf (stderr, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); +#endif + } + +#if defined _LIBC && defined USE_IN_LIBIO + if (n >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#endif + } + nextchar = (char *) ""; + optind++; + optopt = 0; + return '?'; + } + } + + /* Look at and handle the next short option-character. */ + + { + char c = *nextchar++; + char *temp = my_index (optstring, c); + + /* Increment `optind' when we start to process its last character. */ + if (*nextchar == '\0') + ++optind; + + if (temp == NULL || c == ':') + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + int n; +#endif + + if (posixly_correct) + { + /* 1003.2 specifies the format of this message. 
*/ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: illegal option -- %c\n"), + argv[0], c); +#else + fprintf (stderr, _("%s: illegal option -- %c\n"), argv[0], c); +#endif + } + else + { +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: invalid option -- %c\n"), + argv[0], c); +#else + fprintf (stderr, _("%s: invalid option -- %c\n"), argv[0], c); +#endif + } + +#if defined _LIBC && defined USE_IN_LIBIO + if (n >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#endif + } + optopt = c; + return '?'; + } + /* Convenience. Treat POSIX -W foo same as long option --foo */ + if (temp[0] == 'W' && temp[1] == ';') + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = 0; + int option_index; + + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. */ +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, + _("%s: option requires an argument -- %c\n"), + argv[0], c) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("%s: option requires an argument -- %c\n"), + argv[0], c); +#endif + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + return c; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + + /* optarg is now the argument, see if it's in the + table of longopts. 
*/ + + for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) == strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. */ + ambig = 1; + } + if (ambig && !exact) + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]); +#endif + } + nextchar += strlen (nextchar); + optind++; + return '?'; + } + if (pfound != NULL) + { + option_index = indfound; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. 
*/ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name); +#endif + } + + nextchar += strlen (nextchar); + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); +#endif + } + nextchar += strlen (nextchar); + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + nextchar = NULL; + return 'W'; /* Let the application handle it. */ + } + if (temp[1] == ':') + { + if (temp[2] == ':') + { + /* This is an option that accepts an argument optionally. */ + if (*nextchar != '\0') + { + optarg = nextchar; + optind++; + } + else + optarg = NULL; + nextchar = NULL; + } + else + { + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. 
*/ +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option requires an argument -- %c\n"), + argv[0], c) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, + _("%s: option requires an argument -- %c\n"), + argv[0], c); +#endif + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + nextchar = NULL; + } + } + return c; + } +} + +int +getopt (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + return _getopt_internal (argc, argv, optstring, + (const struct option *) 0, + (int *) 0, + 0); +} + +#endif /* Not ELIDE_CODE. */ + +#ifdef TEST + +/* Compile with -DTEST to make an executable for use in testing + the above definition of `getopt'. */ + +int +main (argc, argv) + int argc; + char **argv; +{ + int c; + int digit_optind = 0; + + while (1) + { + int this_option_optind = optind ? optind : 1; + + c = getopt (argc, argv, "abc:d:0123456789"); + if (c == -1) + break; + + switch (c) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (digit_optind != 0 && digit_optind != this_option_optind) + printf ("digits occur in two different argv-elements.\n"); + digit_optind = this_option_optind; + printf ("option %c\n", c); + break; + + case 'a': + printf ("option a\n"); + break; + + case 'b': + printf ("option b\n"); + break; + + case 'c': + printf ("option c with value `%s'\n", optarg); + break; + + case '?': + break; + + default: + printf ("?? 
getopt returned character code 0%o ??\n", c); + } + } + + if (optind < argc) + { + printf ("non-option ARGV-elements: "); + while (optind < argc) + printf ("%s ", argv[optind++]); + printf ("\n"); + } + + exit (0); +} + +#endif /* TEST */ + diff -Nru gmp-ecm-7.0.4+ds/build.vc15/getopt.h gmp-ecm-7.0.5+ds/build.vc15/getopt.h --- gmp-ecm-7.0.4+ds/build.vc15/getopt.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/getopt.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,182 @@ +/* Declarations for getopt. + Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef _GETOPT_H + +#ifndef __need_getopt +# define _GETOPT_H 1 +#endif + +/* If __GNU_LIBRARY__ is not already defined, either we are being used + standalone, or this is the first header included in the source file. + If we are being used with glibc, we need to include , but + that does not exist if we are standalone. So: if __GNU_LIBRARY__ is + not defined, include , which will pull in for us + if it's from glibc. (Why ctype.h? It's guaranteed to exist and it + doesn't flood the namespace with stuff the way some other headers do.) 
*/ +#if !defined __GNU_LIBRARY__ +# include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +extern char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +extern int optind; + +/* Callers store zero here to inhibit the error message `getopt' prints + for unrecognized options. */ + +extern int opterr; + +/* Set to an option character which was unrecognized. */ + +extern int optopt; + +#ifndef __need_getopt +/* Describe the long-named options requested by the application. + The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector + of `struct option' terminated by an element containing a name which is + zero. + + The field `has_arg' is: + no_argument (or 0) if the option does not take an argument, + required_argument (or 1) if the option requires an argument, + optional_argument (or 2) if the option takes an optional argument. + + If the field `flag' is not NULL, it points to a variable that is set + to the value given in the field `val' when the option is found, but + left unchanged if the option is not found. 
+ + To have a long-named option do something other than set an `int' to + a compiled-in constant, such as set a value from `optarg', set the + option's `flag' field to zero and its `val' field to a nonzero + value (the equivalent single-letter option character, if there is + one). For long options that have a zero `flag' field, `getopt' + returns the contents of the `val' field. */ + +struct option +{ +# if (defined __STDC__ && __STDC__) || defined __cplusplus + const char *name; +# else + char *name; +# endif + /* has_arg can't be an enum because some compilers complain about + type mismatches in all the code that assumes it is an int. */ + int has_arg; + int *flag; + int val; +}; + +/* Names for the values of the `has_arg' field of `struct option'. */ + +# define no_argument 0 +# define required_argument 1 +# define optional_argument 2 +#endif /* need getopt */ + + +/* Get definitions and prototypes for functions to process the + arguments in ARGV (ARGC of them, minus the program name) for + options given in OPTS. + + Return the option character from OPTS just read. Return -1 when + there are no more options. For unrecognized options, or options + missing arguments, `optopt' is set to the option letter, and '?' is + returned. + + The OPTS string is a list of characters which are recognized option + letters, optionally followed by colons, specifying that that letter + takes an argument, to be placed in `optarg'. + + If a letter in OPTS is followed by two colons, its argument is + optional. This behavior is specific to the GNU `getopt'. + + The argument `--' causes premature termination of argument + scanning, explicitly telling `getopt' that there are no more + options. + + If OPTS begins with `-', then non-option arguments are treated as + arguments to the option '\1'. This behavior is specific to the GNU + `getopt'. 
*/ + +#if (defined __STDC__ && __STDC__) || defined __cplusplus +# ifdef __GNU_LIBRARY__ +/* Many other libraries have conflicting prototypes for getopt, with + differences in the consts, in stdlib.h. To avoid compilation + errors, only prototype getopt for the GNU C library. */ +extern int getopt (int ___argc, char *const *___argv, const char *__shortopts); +# else /* not __GNU_LIBRARY__ */ +extern int getopt (); +# endif /* __GNU_LIBRARY__ */ + +# ifndef __need_getopt +extern int getopt_long (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind); +extern int getopt_long_only (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind); + +/* Internal only. Users should not call this directly. */ +extern int _getopt_internal (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind, + int __long_only); +# endif +#else /* not __STDC__ */ +extern int getopt (); +# ifndef __need_getopt +extern int getopt_long (); +extern int getopt_long_only (); + +extern int _getopt_internal (); +# endif +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +/* Make sure we later can get all the definitions and declarations. */ +#undef __need_getopt + +#endif /* getopt.h */ + diff -Nru gmp-ecm-7.0.4+ds/build.vc15/getrusage.c gmp-ecm-7.0.5+ds/build.vc15/getrusage.c --- gmp-ecm-7.0.4+ds/build.vc15/getrusage.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/getrusage.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,71 @@ +/* This file is part of the MPIR Library. + + The MPIR Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, or (at + your option) any later version. 
+ The MPIR Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+ You should have received a copy of the GNU Lesser General Public License
+ along with the MPIR Library; see the file COPYING.LIB. If not, write
+ to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA.
+*/
+
+#define WIN32_LEAN_AND_MEAN
+
+/* NOTE(review): the three header names were lost when this patch text was
+   extracted; these are the headers the code below requires (Win32 process
+   API, PSAPI memory/performance queries, errno) -- confirm against upstream. */
+#include <windows.h>
+#include <psapi.h>
+#include <errno.h>
+
+#include "getrusage.h"
+
+/* Overlay that lets a FILETIME filled in by the Win32 API be read back as a
+   single 64-bit tick count. */
+typedef union file_t
+{
+    FILETIME ft;
+    long long lt;
+} file_t;
+
+/* Windows stand-in for getrusage().
+
+   who    must be RUSAGE_SELF.  RUSAGE_CHILDREN fails with errno = ENODATA,
+          any other value with errno = EINVAL.
+   usage  receives the result; a NULL pointer fails with errno = EFAULT.
+
+   On success returns 0 after filling ru_utime and ru_stime (user and kernel
+   time of the current process), ru_maxrss (working-set size divided by the
+   system page size) and ru_majflt (page-fault count).  No other member of
+   *usage is written.
+
+   Returns -1 with errno = EACCES when the process handle cannot be opened
+   or any of the Win32 queries fails.  The time members may already have
+   been written when a later memory query fails. */
+int getrusage(int who, rusage *usage)
+{
+    HANDLE proc_hand;
+    file_t c_time, x_time, s_time, u_time;
+    int err = -1;
+
+    if(who != RUSAGE_SELF)
+    {
+        errno = (who == RUSAGE_CHILDREN ? ENODATA : EINVAL);
+        return err;
+    }
+
+    if(usage == NULL)
+    {
+        errno = EFAULT;
+        return err;
+    }
+
+    proc_hand = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, GetCurrentProcessId());
+
+    /* OpenProcess yields NULL on failure; bail out instead of passing an
+       invalid handle to the query functions and to CloseHandle. */
+    if(proc_hand == NULL)
+    {
+        errno = EACCES;
+        return err;
+    }
+
+    if(GetProcessTimes(proc_hand, &(c_time.ft), &(x_time.ft), &(s_time.ft), &(u_time.ft)))
+    {
+        PROCESS_MEMORY_COUNTERS ctrs;
+
+        /* The units returned by GetProcessTimes are 100 nanoseconds;
+           convert to microseconds, rounding to nearest. */
+        u_time.lt = (u_time.lt + 5) / 10;
+        s_time.lt = (s_time.lt + 5) / 10;
+
+        usage->ru_utime.tv_sec = (long)(u_time.lt / 1000000ll);
+        usage->ru_stime.tv_sec = (long)(s_time.lt / 1000000ll);
+        usage->ru_utime.tv_usec = (long)(u_time.lt % 1000000ll);
+        usage->ru_stime.tv_usec = (long)(s_time.lt % 1000000ll);
+
+        if(GetProcessMemoryInfo(proc_hand, &ctrs, sizeof(ctrs)))
+        {
+            PERFORMANCE_INFORMATION perf_info;
+
+            /* Divide by the page size only once the query has succeeded and
+               reported a usable value. */
+            if(GetPerformanceInfo(&perf_info, sizeof(perf_info)) && perf_info.PageSize != 0)
+            {
+                usage->ru_maxrss = (DWORD) (ctrs.WorkingSetSize / perf_info.PageSize);
+                usage->ru_majflt = ctrs.PageFaultCount;
+                err = 0;
+            }
+        }
+    }
+
+    if(err)
+        errno = EACCES;
+    CloseHandle(proc_hand);
+    return err;
+}
diff -Nru gmp-ecm-7.0.4+ds/build.vc15/getrusage.h gmp-ecm-7.0.5+ds/build.vc15/getrusage.h
--- gmp-ecm-7.0.4+ds/build.vc15/getrusage.h
1970-01-01 00:00:00.000000000 +0000
+++ gmp-ecm-7.0.5+ds/build.vc15/getrusage.h 2022-06-06 14:16:49.000000000 +0000
@@ -0,0 +1,46 @@
+
+#ifndef _GETRUSAGE_H
+#define _GETRUSAGE_H
+
+/* Pull in the CRT's own errno values first so the ENODATA fallback below
+   is only used when the CRT really lacks it. */
+#include <errno.h>
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+/* <errno.h> of current Microsoft CRTs already defines ENODATA; redefining
+   it with a different value triggers a macro-redefinition warning and
+   disagrees with the CRT, so 61 is kept only as a fallback. */
+#ifndef ENODATA
+#define ENODATA 61
+#endif
+#define RUSAGE_SELF 0
+/* Parenthesised so the macro stays a single operand in any expression. */
+#define RUSAGE_CHILDREN (-1)
+
+/* Seconds / microseconds pair used for the time members of rusage. */
+typedef struct
+{
+    long tv_sec;
+    long tv_usec;
+} tval;
+
+typedef struct rusage
+{
+    tval ru_utime; /* user time used */
+    tval ru_stime; /* system time used */
+    long ru_maxrss; /* integral max resident set size */
+    long ru_ixrss; /* integral shared text memory size */
+    long ru_idrss; /* integral unshared data size */
+    long ru_isrss; /* integral unshared stack size */
+    long ru_minflt; /* page reclaims */
+    long ru_majflt; /* page faults */
+    long ru_nswap; /* swaps */
+    long ru_inblock; /* block input operations */
+    long ru_oublock; /* block output operations */
+    long ru_msgsnd; /* messages sent */
+    long ru_msgrcv; /* messages received */
+    long ru_nsignals;/* signals received */
+    long ru_nvcsw; /* voluntary context switches */
+    long ru_nivcsw; /* involuntary context switches */
+} rusage;
+
+int getrusage(int who, rusage *usage);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff -Nru gmp-ecm-7.0.4+ds/build.vc15/gettimeofday.c gmp-ecm-7.0.5+ds/build.vc15/gettimeofday.c
--- gmp-ecm-7.0.4+ds/build.vc15/gettimeofday.c 1970-01-01 00:00:00.000000000 +0000
+++ gmp-ecm-7.0.5+ds/build.vc15/gettimeofday.c 2022-06-06 14:16:49.000000000 +0000
@@ -0,0 +1,39 @@
+
+#define WIN32_LEAN_AND_MEAN
+/* NOTE(review): both header names were lost in extraction; these are the
+   headers the code below requires (FILETIME / GetSystemTimeAsFileTime, and
+   _tzset / _timezone / _daylight) -- confirm against upstream. */
+#include <windows.h>
+#include <time.h>
+
+#include "gettimeofday.h"
+
+/* Windows stand-in for gettimeofday().
+
+   tv  if non-NULL, receives the current system time: the FILETIME tick
+       count minus EPOCHFILETIME, divided by 10 and split into seconds and
+       microseconds.
+   tz  if non-NULL, receives _timezone / 60 as tz_minuteswest and _daylight
+       as tz_dsttime; _tzset() is called once, on the first such request.
+
+   Always returns 0. */
+int gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+    FILETIME ft;
+    LARGE_INTEGER li;
+    __int64 t;
+    static int tzflag; /* non-zero once _tzset() has been called */
+
+    if(tv)
+    {
+        GetSystemTimeAsFileTime(&ft);
+        /* Assemble the two 32-bit halves into one 64-bit value. */
+        li.LowPart = ft.dwLowDateTime;
+        li.HighPart = ft.dwHighDateTime;
+        t = li.QuadPart;
+        /* presumably EPOCHFILETIME is the FILETIME value of the Unix epoch
+           -- confirm */
+        t -= EPOCHFILETIME;
+        t /= 10;
+        tv->tv_sec = (long)(t / 1000000);
+        tv->tv_usec = (long)(t % 1000000);
+    }
+
+    if (tz)
+    {
+        if (!tzflag)
+        {
+            _tzset();
+            tzflag++;
+        }
+        tz->tz_minuteswest = _timezone / 60;
+        tz->tz_dsttime = _daylight;
+    }
+
+    return 0;
+}
diff -Nru gmp-ecm-7.0.4+ds/build.vc15/gettimeofday.h gmp-ecm-7.0.5+ds/build.vc15/gettimeofday.h
--- gmp-ecm-7.0.4+ds/build.vc15/gettimeofday.h 1970-01-01 00:00:00.000000000 +0000
+++ gmp-ecm-7.0.5+ds/build.vc15/gettimeofday.h 2022-06-06 14:16:49.000000000 +0000
@@ -0,0 +1,34 @@
+/*
+ * timeval.h 1.0 01/12/19
+ *
+ * Defines gettimeofday, timeval, etc. for Win32
+ *
+ * By Wu Yongwei
+ *
+ */
+#ifndef _TIMEVAL_H
+#define _TIMEVAL_H
+
+/* NOTE(review): the header name was lost in extraction; <winsock2.h> is
+   presumed because the prototype below needs struct timeval -- confirm
+   against upstream. */
+#include <winsock2.h>
+
+#define EPOCHFILETIME (116444736000000000LL)
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+struct timezone
+{
+    int tz_minuteswest; /* minutes W of Greenwich */
+    int tz_dsttime; /* type of dst correction */
+};
+
+int gettimeofday(struct timeval *tv, struct timezone *tz);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* _TIMEVAL_H */
+
diff -Nru gmp-ecm-7.0.4+ds/build.vc15/libecm/libecm.vcxproj gmp-ecm-7.0.5+ds/build.vc15/libecm/libecm.vcxproj
--- gmp-ecm-7.0.4+ds/build.vc15/libecm/libecm.vcxproj 1970-01-01 00:00:00.000000000 +0000
+++ gmp-ecm-7.0.5+ds/build.vc15/libecm/libecm.vcxproj 2022-06-06 14:16:49.000000000 +0000
@@ -0,0 +1,253 @@
+ + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {CD555681-D65B-4173-A29C-B8BF06A4871B} + libecm + Win32Proj + 10.0.16299.0 + + + + StaticLibrary + v141 + + + StaticLibrary + v141 + + + StaticLibrary + v141 + + + StaticLibrary + Static + v141 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + + ..\prebuild NO_GPU + + + Full + true +
..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + Default + true + + + + $(IntDir)%(Filename).obj + + + + + ..\prebuild NO_GPU + + + X64 + + + Full + true + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + + + Level3 + Default + true + + + + + _WIN64 + $(IntDir)%(Filename).obj + + + + + ..\prebuild NO_GPU + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + Default + true + + + + $(IntDir)%(Filename).obj + + + + + ..\prebuild NO_GPU + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;_DEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + Default + true + + + + + _WIN64 + $(IntDir)%(Filename).obj + + + + + + + + + + + + + + + + + + + + + + + true + true + true + true + + + + + + + + + + + + + + + + + + + Full + + + Full + + + Full + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/libecm/libecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc15/libecm/libecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc15/libecm/libecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/libecm/libecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,190 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {2f18179f-5dba-420c-8dc7-bc7f8228a1b2} + + + + + Source Files\Assembler + + + Source Files\Assembler + + + + + Source Files 
+ + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/libecm/Makefile.am gmp-ecm-7.0.5+ds/build.vc15/libecm/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc15/libecm/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/libecm/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = libecm.vcxproj libecm.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc15/libecm_gpu/libecm_gpu.vcxproj gmp-ecm-7.0.5+ds/build.vc15/libecm_gpu/libecm_gpu.vcxproj --- gmp-ecm-7.0.4+ds/build.vc15/libecm_gpu/libecm_gpu.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/libecm_gpu/libecm_gpu.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,311 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00} + libecm_gpu + Win32Proj + 10.0.17134.0 + + + + StaticLibrary + v141 + + + StaticLibrary + v141 + + + StaticLibrary + v141 + + + 
StaticLibrary + Static + v141 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + ecmlib + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + + ..\prebuild GPU + + + true + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;NDEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + + + Level3 + Default + Full + + + + compute_50,sm_50 + + + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 32 + + + true + + + + + ..\prebuild GPU + + + X64 + + + Full + true + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;NDEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + + + Level3 + Default + true + + + + + _WIN64 + + + compute_50,sm_50 + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 64 + + + true + + + + + + + + + ..\prebuild GPU + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_DEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + + + Level3 + Default + true + + + + compute_50,sm_50 + + + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 32 + + + true + + + + + ..\prebuild GPU + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;_DEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + + + Level3 + Default + true + + + + + _WIN64 + + + compute_50,sm_50 + + + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 64 + + + true + + + + + true + true + true + + + + + + + + + + + + + + + + true + true + true + true + + + + + + + 
+ + + + + + + + + + + + Full + + + Full + + + Full + + + + true + true + true + + + + + + + + + + + + + + + + + + + + + + + + + Document + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/libecm_gpu/libecm_gpu.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc15/libecm_gpu/libecm_gpu.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc15/libecm_gpu/libecm_gpu.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/libecm_gpu/libecm_gpu.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,193 @@ + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + {dfe792df-b4ff-4147-be95-190117baae33} + + + {0315d9d5-3f8f-456a-ae54-e00de69b9350} + + + {cbe6b893-95dc-4f4b-b2e9-73245cf57c75} + + + + + Source Files + + + + + Source Files\Assembler + + + Source Files\Assembler + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/libecm_gpu/Makefile.am gmp-ecm-7.0.5+ds/build.vc15/libecm_gpu/Makefile.am --- 
gmp-ecm-7.0.4+ds/build.vc15/libecm_gpu/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/libecm_gpu/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = libecm_gpu.vcxproj libecm_gpu.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc15/Makefile.am gmp-ecm-7.0.5+ds/build.vc15/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc15/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,7 @@ +EXTRA_DIST = config.h ecm.sln ecm_gpu.sln file_copy.bat gen_ecm_h.bat \ + getopt.c getopt.h getrusage.c getrusage.h gettimeofday.c \ + gettimeofday.h mp_lib.props out_copy_rename.bat prebuild.bat \ + python.bat readme.txt tests.py vacopy.c vsyasm.props \ + vsyasm.targets vsyasm.xml + +DIST_SUBDIRS = assembler ecm ecm_gpu libecm libecm_gpu tune bench_mulredc diff -Nru gmp-ecm-7.0.4+ds/build.vc15/mp_lib.props gmp-ecm-7.0.5+ds/build.vc15/mp_lib.props --- gmp-ecm-7.0.4+ds/build.vc15/mp_lib.props 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/mp_lib.props 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,27 @@ + + + + mpir\lib\ + mpir.lib + + + <_ProjectFileVersion>10.0.30128.1 + + + + + + MultiThreaded + + + + + $(mp_dir) + true + + + $(mp_lib) + true + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/multiecm/Makefile.am gmp-ecm-7.0.5+ds/build.vc15/multiecm/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc15/multiecm/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/multiecm/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = multiecm.vcxproj multiecm.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc15/multiecm/multiecm.vcxproj gmp-ecm-7.0.5+ds/build.vc15/multiecm/multiecm.vcxproj --- gmp-ecm-7.0.4+ds/build.vc15/multiecm/multiecm.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/multiecm/multiecm.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 
+1,238 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + multiecm + Win32Proj + 10.0.16299.0 + {16434DC2-371C-451B-A336-820499B98B8C} + + + + Application + v141 + + + Application + v141 + + + Application + v141 + + + Application + v141 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + + Full + true + Speed + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + false + Console + true + true + false + + + MachineX86 + + + + + X64 + + + Full + true + Speed + ..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) + Console + true + true + false + + + MachineX64 + 8388608 + 65536 + + + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX86 + + + + + X64 + + + Disabled + 
..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX64 + 8388608 + 65536 + + + + + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + + + + + + + + + + + + + + + + + + + + + + + + + + + {cd555681-d65b-4173-a29c-b8bf06a4871b} + false + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/multiecm/multiecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc15/multiecm/multiecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc15/multiecm/multiecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/multiecm/multiecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,71 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/out_copy_rename.bat gmp-ecm-7.0.5+ds/build.vc15/out_copy_rename.bat --- gmp-ecm-7.0.4+ds/build.vc15/out_copy_rename.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/out_copy_rename.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,31 @@ +@echo off +if not exist %1 goto nofile +if exist %2 goto next + +echo creating directory %2 +md %2 > nul + 
+:next +rem strip quotes if present +set str=%2 +for /f "useback tokens=*" %%a in ('%str%') do set str=%%~a + +rem add a backslash if the output directory lacks one +set str=%str:~-1% +if "%str%" == "\" (set outf=%2%3) else (set outf=%2\%3) + +echo copying %1 to %outf% (if not present or changed) +if not exist "%outf%" goto copy + +rem don't overwrite if output exists and is not changed +fc %1 %outf% > nul && if not %errorlevel 1 goto exit +echo overwriting %outf% with %1 + +:copy +copy %1 %outf% > nul +goto exit + +:nofile +echo %1 not found + +:exit diff -Nru gmp-ecm-7.0.4+ds/build.vc15/prebuild.bat gmp-ecm-7.0.5+ds/build.vc15/prebuild.bat --- gmp-ecm-7.0.4+ds/build.vc15/prebuild.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/prebuild.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ +cd ..\ +call out_copy_rename config.h ..\ config.h +call gen_ecm_h diff -Nru gmp-ecm-7.0.4+ds/build.vc15/python.bat gmp-ecm-7.0.5+ds/build.vc15/python.bat --- gmp-ecm-7.0.4+ds/build.vc15/python.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/python.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,2 @@ +@echo off +"c:\program files\python34\python" %1 diff -Nru gmp-ecm-7.0.4+ds/build.vc15/readme.txt gmp-ecm-7.0.5+ds/build.vc15/readme.txt --- gmp-ecm-7.0.4+ds/build.vc15/readme.txt 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/readme.txt 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,177 @@ + +Building GMP-ECM with Microsoft Visual C++ 2017 +=============================================== + +If you wish to build the assembler code support you will need to +install the YASM assembler that is available at: + + http://www.tortall.net/projects/yasm/ + +The version you need is vsyasm, which should be put in the directory + + C:\Program Files\yasm + +Alternatively vsyasm can be installed anywhere provided the environment +variable YASMPATH is set to its absolute file path.
+ +The Multi-Precision Library - GMP and MPIR +========================================== + +GMP-ECM works with either GMP or MPIR, a fork of GMP. To build and run +GMP-ECM using Visual Studio you first need to obtain and build either +GMP or MPIR. MPIR has a fully integrated Visual Studio build system +for Windows but GMP does not. + +The VC++ build of GMP-ECM now defaults to MPIR but the property sheet +mp_lib.props can be edited to set the macro mp_lib to 'gmp' instead +of 'mpir' to build ECM using GMP. + +GMP +=== + +GMP can be built from the GMP source code available here: + + http://gmplib.org/ + +GMP can be built with mingw for 32-bit Windows and mingw64 for Windows x64. +It is reported that the resulting libraries work with Visual Studio when +appropriately renamed. + +MPIR +==== + +MPIR is available here: + + http://www.mpir.org + +It has full support for building MPIR for 32 and 64 bit Windows systems +with x86 assembler support using the YASM assembler. + +Building GMP-ECM +================ + +The build files for GMP-ECM assume that the GMP and ECM build directories +are in a common parent directory as follows: + + Parent Directory + MPIR (or GMP) + build.vc15 -- MPIR (or GMP) build files + ... + GMP-ECM + build.vc15 -- ECM build files + +The root directories for GMP and GMP-ECM are assumed to have these names +irrespective of which version is being used (they used to be followed by +version numbers but this meant that the build projects had to be updated +too frequently). + +The normal (non GPU) build is opened by loading the file ecm.sln (from +the build.vc15 directory) into Visual Studio. This provides these build +projects in build.vc15 for the non GPU build: + + ecm - the ECM application + ecmlib - the ECM library + tune - a program for tuning + bench_mulredc - for benchmarking mulredc + multiecm - work in progress (not working) + +The GPU build is opened by loading the file ecm_gpu.sln (from the build.vc15 +directory) into Visual Studio.
This provides two build projects in +build.vc15: + + ecm_gpu - the ECM application + libecm_gpu - the ECM library + +In all cases you have to choose either a win32 or x64 build and either a +Release or Debug configuration (however the win32 builds are no longer +actively supported and may not work). + +The non GPU Build +----------------- + +Before starting a build, there are a number of configuration options +that need to be set: + +1. If you wish to compile GMP-ECM for use on a particular processor, + select the appropriate define from the file 'ecm-params.h' in the + GMP-ECM root directory and decide which of the defines suit your + needs (e.g. __tune_corei7__). Then replace the existing define: + + /* define Windows tuning here */ + # define __tune_corei7__ + + towards the end of the config.h file in the 'build.vc15' + directory (build.vc15\config.h) with the chosen define. + +2. The file at 'build.vc14\mul_fft-params.h' allows the FFT code to + be tuned to 32 or 64-bit systems by selecting an option by + changing the appropriate '#elif 0' to '#elif 1'. If you wish to + use the win32 AMD assembler files, you also have to use the + Visual Studio property page to define AMD_ASM (alternatively + you can edit the two files mulredc.asm and redc.asm in the + build.vc15\assembler\ directory to include the AMD assembler). + +The GPU Build +------------- + +1. If you wish to build with a GPU capability you will need to + install Nvidia Nsight for Visual Studio version 5.4 and the + CUDA Toolkit v9.0. You can then build the libecm_gpu and + ecm_gpu projects. + +2. The choices above for the non GPU build also apply when + building for a GPU based system. + + By default, the GPU configuration is "compute_50,sm_50". If + you need to change this, select libecm_gpu and ecm_gpu and + set the properties for "CUDA C/C++|Device|Code Generation" for + your GPU capability.
+ + Also under "C/C++|Preprocessor|Preprocessor Definitions" for + both these projects, change the current definition GPU_CC50 to + that for your GPU capability. + +Build Configurations +-------------------- + +When a version of ecm and ecmlib are built, the library and the application +are put in the directory matching the configuration that has been built: + + GMP-ECM + build.vc15 -- ECM build files + lib -- ECM static library files + bin -- ECM executable files + +Within these lib, dll and bin directories, the outputs are located in +sub-directories determined by the platform and configuration: + + win32\release + win32\debug + x64\release + x64\debug + +If you don't want assembler support you need to change the define: + +#define NATIVE_REDC 1 + +in config.h (in the build.vc15 subdirectory) to: + +#undef NATIVE_REDC + +Tune +==== + +If tune is compiled and run for a particular configuration it will output +suitable values for optimising GMP-ECM to the console window. To optimise +GMP-ECM these values should be put in a suitably named file which then has +to be integrated in ecm-params.h. + +Tests +===== + +The file tests.py is a Python script that runs the ECM tests. It runs the +x64/release-amd (non GPU) version by default but can be edited to test other +builds. It cannot run some tests as a result of the difficulty in the +conversion of the Unix shell scripts for the tests for use on Windows.
+ + Brian Gladman, November 2017 diff -Nru gmp-ecm-7.0.4+ds/build.vc15/test.ecm.save gmp-ecm-7.0.5+ds/build.vc15/test.ecm.save --- gmp-ecm-7.0.4+ds/build.vc15/test.ecm.save 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/test.ecm.save 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +METHOD=ECM; PARAM=0; SIGMA=585928442; B1=174000; N=17061648125571273329563156588435816942778260706938821014533; X=0x1a2a694df04a5c037fd12f42668b474f16b7818933f4c8484; CHECKSUM=1505596339; PROGRAM=GMP-ECM 7.0-dev; Y=0x0; X0=0x0; Y0=0x0; WHO=brian@MEGA-SLAVE; TIME=Fri Feb 13 09:01:45 2015; diff -Nru gmp-ecm-7.0.4+ds/build.vc15/tests.py gmp-ecm-7.0.5+ds/build.vc15/tests.py --- gmp-ecm-7.0.4+ds/build.vc15/tests.py 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/tests.py 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,150 @@ + +from __future__ import print_function +import os +import sys +import string +import platform +from re import match +from subprocess import Popen, PIPE, STDOUT +from tempfile import * + +try: + from time import perf_counter as timer +except ImportError: + from time import clock as timer + + +x64 = True +debug = False +test_gpu_version = True +run_non_gpu_tests = True +run_gpu_tests = True + +class Timer() : + def __enter__(self): self.start = timer() + def __exit__(self, *args): print(' time {:.3f} milliseconds'.format(1000 * (timer() - self.start))) + +cpath = os.path.dirname(__file__) +config = 'x64' if x64 else 'Win32' +mode = 'Debug' if debug else 'Release' +test_dir = '..\\bin\\{:s}\\{:s}\\'.format(config, mode) + +def get_tests(filename): + print('running tests in {:s}'.format(filename)) + start, sub, tests, c_tests = True, dict(), [], [] + with open(os.path.join(cpath, filename)) as f: + lines = f.readlines() + cnt, lnth = 0, len(lines) + while cnt < lnth: + try: + line = lines[cnt].strip() + cnt += 1 + tkns = line.split() + if line.startswith('echo') and len(tkns) > 2 and tkns[2] == '|': + while cnt < lnth and 
'checkcode' not in line: + while cnt < lnth and not lines[cnt]: + cnt += 1 + if cnt < lnth: + line += '|' + lines[cnt] + cnt += 1 + start = False + elif start: + sp = line.split('="') + if len(sp) == 2: + if sp[1].startswith('${1:-./ecm}'): + sub[sp[0]] = sp[1][12:-1] + else: + sub[sp[0]] = sp[1][:-1] + continue + else: + continue + line = line.replace(';', '|') + sub_tests = [] + for line_part in line.split('|'): + tkns = line_part.strip().split() + cmd = [] + for tok in tkns: + if tok.startswith('"') and tok.endswith('"'): + tok = tok[1:-1] + if tok[0] == '$' and tok[1:] in sub: + tok = tok.replace(tok, sub[tok[1:]]) + elif tok == './ecm': + tok = '' + cmd += [tok] + cseq = [] + if cmd and cmd[0] == 'echo': + cseq += [cmd[1]] + cmd = cmd[2:] + if len(cmd) >= 3 and cmd[-3] == 'checkcode' and cmd[-2] == '$?': + cseq += [int(cmd[-1])] + cmd = cmd[:-3] + cmd = (' '.join(cmd)).strip() + if cmd: + cseq += [cmd] + sub_tests += [cseq] + if len(sub_tests) == 3 and all(len(x) == 1 for x in sub_tests): + tests += [tuple(x[0] for x in sub_tests)] + else: + c_tests += [sub_tests] + except ValueError: + print('parsing error on line {} in text "{}"'.format(cnt, line)) + return tests, c_tests + +def run_exe(exe, args, inp) : + al = {'stdin' : PIPE, 'stdout' : PIPE, 'stderr' : STDOUT } + if sys.platform.startswith('win'): + al['creationflags'] = 0x08000000 + p = Popen([os.path.join(cpath, exe)] + args.split(' '), **al) + res = p.communicate(inp.encode())[0].decode() + ret = p.poll() + return (ret, res) + +def output_complex_tests(x): + print('these tests are too complex:') + for t in x: + print(t) + +def do_tests(tests, ctests, out=False, gpu=False): + ecm_exe = test_dir + ("ecm_gpu.exe" if gpu else "ecm.exe") + err_cnt = 0 + for ix, tt in enumerate(tests): + print(tt[1], tt[0], end='') + rv = run_exe(ecm_exe, tt[1], tt[0]) + if type(tt[2]) == int and rv[0] != tt[2]: + print(" - *** ERROR in test {:d}: {:d} {:d} ***".format(ix, rv[0], tt[2])) + err_cnt += 1 + elif type(tt[2]) == 
tuple and rv[0] != tt[2][0] and rv[0] != tt[2][1]: + print(" - *** ERROR in test {:d}: {:d} {:s} ***".format(ix, rv[0], tt[2])) + err_cnt += 1 + else: + print(" - passed") + if out: + op = rv[1].rsplit('\r\n') + for i in op : + print(i) + + if ctests: + output_complex_tests(ctests) + if not err_cnt: + if ctests: + print('all other tests passed') + else: + print('all tests passed') + +with Timer(): + if os.path.exists('test.pm1.save'): + os.remove('test.pm1.save') + if run_non_gpu_tests: + t, ct = get_tests("..\\test.ecm") + do_tests(t, ct) + t, ct = get_tests("..\\test.pm1") + do_tests(t, ct) + t, ct = get_tests("..\\test.pp1") + do_tests(t, ct) + t, ct = get_tests("..\\testlong.pp1") + do_tests(t, ct) + t, ct = get_tests("..\\testlong.pm1") + do_tests(t, ct) + if run_gpu_tests: + t, ct = get_tests("..\\test.gpuecm") + do_tests(t, ct, gpu=True) diff -Nru gmp-ecm-7.0.4+ds/build.vc15/tune/Makefile.am gmp-ecm-7.0.5+ds/build.vc15/tune/Makefile.am --- gmp-ecm-7.0.4+ds/build.vc15/tune/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/tune/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = tune.vcxproj tune.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vc15/tune/tune.vcxproj gmp-ecm-7.0.5+ds/build.vc15/tune/tune.vcxproj --- gmp-ecm-7.0.4+ds/build.vc15/tune/tune.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/tune/tune.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,164 @@ + + + + + Release + Win32 + + + Release + x64 + + + + {80E08750-5C6C-492E-BB1E-7200978AE125} + tune + Win32Proj + 10.0.16299.0 + + + + Application + Unicode + true + v141 + + + Application + NotSet + v141 + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + + + + MaxSpeed + true + 
..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CONSOLE;TUNE;%(PreprocessorDefinitions) + MultiThreaded + true + + + Level3 + ProgramDatabase + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);%(AdditionalDependencies) + true + Console + true + true + MachineX86 + + + + + + + + + $(IntDir)%(FileName).obj + + + + + X64 + + + MaxSpeed + true + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_CONSOLE;TUNE;%(PreprocessorDefinitions) + MultiThreaded + true + + + Level3 + ProgramDatabase + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);%(AdditionalDependencies) + true + Console + true + true + MachineX64 + + + + + + + _WIN64 + $(IntDir)%(FileName).obj + + + + + + + + + + + + + + + TUNE_MULREDC_THRESH#0;TUNE_SQRREDC_THRESH#0;%(PreprocessorDefinitions) + TUNE_MULREDC_THRESH#0;TUNE_SQRREDC_THRESH#0;%(PreprocessorDefinitions) + + + + + + + + + + + + + + + + + + + + + + + + + + + {cd555681-d65b-4173-a29c-b8bf06a4871b} + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/tune/tune.vcxproj.filters gmp-ecm-7.0.5+ds/build.vc15/tune/tune.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vc15/tune/tune.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/tune/tune.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,109 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {38f1a18f-40fc-4eed-a68e-e79b58327b6c} + + + + + Source Files\Assembler + + + Source Files\Assembler + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source 
Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/tune.txt gmp-ecm-7.0.5+ds/build.vc15/tune.txt --- gmp-ecm-7.0.4+ds/build.vc15/tune.txt 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/tune.txt 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,11 @@ +#define LIST_MUL_TABLE {0,0,0,0,0,0,0,0,0,0,3,3,3,3,1,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3} +#define MPZMOD_THRESHOLD 25 +#define REDC_THRESHOLD 512 +#define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 1, 1, 1} +#define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 +#define NTT_GFP_TWIDDLE_DIT_BREAKOVER 16 +#define MUL_NTT_THRESHOLD 262144 +#define PREREVERTDIVISION_NTT_THRESHOLD 131072 +#define POLYINVERT_NTT_THRESHOLD 262144 +#define POLYEVALT_NTT_THRESHOLD 32768 +#define MPZSPV_NORMALISE_STRIDE 512 diff -Nru gmp-ecm-7.0.4+ds/build.vc15/vacopy.c gmp-ecm-7.0.5+ds/build.vc15/vacopy.c --- gmp-ecm-7.0.4+ds/build.vc15/vacopy.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/vacopy.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,8 @@ + +#include +#include + +void _vacopy(va_list *pap, va_list ap) +{ + *pap = ap; +} diff -Nru gmp-ecm-7.0.4+ds/build.vc15/vsyasm.props gmp-ecm-7.0.5+ds/build.vc15/vsyasm.props --- gmp-ecm-7.0.4+ds/build.vc15/vsyasm.props 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/vsyasm.props 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,27 @@ + + + + Midl + CustomBuild + + + _SelectedFiles;$(YASMDependsOn) + + + C:\Program Files\yasm\ + + + + False + $(IntDir)%(FileName).obj + 0 + 0 + "$(YasmPath)"vsyasm.exe -Xvc -f $(Platform) [AllOptions] [AdditionalOptions] [Inputs] + %(ObjectFile) + Assembling %(Filename)%(Extension) ==> $(IntDir)%(FileName).obj + false + + + \ No newline at end of file 
diff -Nru gmp-ecm-7.0.4+ds/build.vc15/vsyasm.targets gmp-ecm-7.0.5+ds/build.vc15/vsyasm.targets --- gmp-ecm-7.0.4+ds/build.vc15/vsyasm.targets 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/vsyasm.targets 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,110 @@ + + + + + + + _YASM + + + + + + $(ComputeLinkInputsTargets); + ComputeYASMOutput; + + + $(ComputeLibInputsTargets); + ComputeYASMOutput; + + + + + $(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml + + + + + + <_YASMReadTlog + Include="^%(YASM.FullPath);%(YASM.AdditionalDependencies)" + Condition="'%(YASM.ExcludedFromBuild)' != 'true' and '%(YASM.ObjectFile)' != ''"/> + <_YASMWriteTlog + Include="^%(YASM.FullPath);$([MSBuild]::NormalizePath('$(MSBuildProjectDirectory)', '%(YASM.ObjectFile)'))" + Condition="'%(YASM.ExcludedFromBuild)' != 'true' and '%(YASM.ObjectFile)' != ''"/> + + + + + + + + <_YASMReadTlog Remove="@(_YASMReadTlog)" /> + <_YASMWriteTlog Remove="@(_YASMWriteTlog)" /> + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vc15/vsyasm.xml gmp-ecm-7.0.5+ds/build.vc15/vsyasm.xml --- gmp-ecm-7.0.4+ds/build.vc15/vsyasm.xml 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vc15/vsyasm.xml 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,283 @@ + + + + + + + + + + + + + General + + + + + + Symbols + + + + + + Files + + + + + + Command Line + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Execute Before + + + Specifies the targets for the build customization to run before. + + + + + + + + + + + + Execute After + + + Specifies the targets for the build customization to run after. 
+ + + + + + + + + + + + + + + + + + Additional Options + + + Additional Options + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/a_win32a_redc.asm gmp-ecm-7.0.5+ds/build.vs/assembler/a_win32a_redc.asm --- gmp-ecm-7.0.4+ds/build.vs/assembler/a_win32a_redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/a_win32a_redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,133 @@ +; +; Part of GMP-ECM +; +; void ecm_redc3( +; mp_limb_t *z, rdi r8 <- rcx +; const mp_limb_t *x, rsi r9 <- rdx +; size_t n, rdx r10 <- r8 +; mp_limb_t m rcx r11 <- r9 +; ) + +%macro seq 3 + mov eax, [byte esi+4*%3] + mul ebp + add [byte edi+4*%3], %2 + adc %1, eax + mov %2, edx + adc %2, 0 +%endmacro + + text + global _ecm_redc3 + +_ecm_redc3: + push ebp + push edi + push esi + push ebx + sub esp, 16 + mov ecx, [esp+44] + mov edi, [esp+36] + mov [esp], ecx + cmp ecx, 5 + jae .3 + +.1: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + xor ebx, ebx + +.2: mov eax, [esi] + add edi, 4 + mul ebp + add esi, 4 + add eax, ebx + adc edx, 0 + add [edi-4], eax + adc edx, 0 + dec ecx + mov ebx, edx + jnz .2 + mov edi, [esp+36] + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .1 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + +.3: mov edx, ecx + dec ecx + sub edx, 2 + neg ecx + shr edx, 4 + and ecx, 15 + mov [esp+8], edx + mov edx, ecx + shl edx, 4 + neg ecx + lea edx, [edx+ecx+.6] + mov [esp+44], ecx + mov [esp+12], edx + +.4: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + mov edx, [esp+8] + mov [esp+4], edx + mov eax, [esi] + lea esi, [esi+ecx*4+4] + mul ebp + lea edi, [edi+ecx*4] + mov ebx, edx + mov edx, [esp+12] + test ecx, 1 + mov ecx, eax + cmovnz ecx, ebx + cmovnz ebx, eax + jmp edx + + align 32 +.5: add edi, 64 +.6: + +%assign i 0 +%rep 16 + %if (i & 1) + seq ecx, ebx, i + %else + seq ebx, ecx, i + %endif 
+ %assign i i + 1 +%endrep + + dec dword [esp+4] + lea esi, [esi+64] + jns .5 + + add [edi+64], ecx + mov edi, [esp+36] + adc ebx, 0 + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .4 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + + end + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/a_win32p_mulredc.asm gmp-ecm-7.0.5+ds/build.vs/assembler/a_win32p_mulredc.asm --- gmp-ecm-7.0.4+ds/build.vs/assembler/a_win32p_mulredc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/a_win32p_mulredc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,148 @@ + +; Part of GMP-ECM +; +; mp_limb_t mulredc1( 1 limb +; mp_limb_t *z, +; const mp_limb_t x, +; const mp_limb_t y, +; const mp_limb_t m, +; mp_limb_t inv_m +; ) +; +; mp_limb_t mulredc( > 1 limb +; mp_limb_t *z, +; const mp_limb_t *x, +; const mp_limb_t *y, +; const mp_limb_t *m, +; mp_limb_t inv_m +; ) + +%macro mseq 1 + movd mm1, [esi+4*%1] + movd mm2, [edi+4*%1] + pmuludq mm1, mm7 + paddq mm2, mm1 + paddq mm0, mm2 + movd [edi+4*%1], mm0 + psrlq mm0, 32 +%endmacro + +%macro mulredc 1 +%assign limbs %1 +%define f_name(x) _mulredc %+ x + + global f_name(limbs) +%ifdef DLL + export f_name(limbs) +%endif + +f_name(limbs): + push ebp + push edi + push esi + push ebx + sub esp, 8*(limbs+1) + mov edi, esp + +%assign i 0 +%rep 2 * limbs + 1 + mov dword [edi+4*i], 0 + %assign i i + 1 +%endrep + + mov dword [esp+8*limbs+4], limbs + + align 32 + +.1: mov eax, [esp+8*limbs+32] + mov esi, [esp+8*limbs+36] + mov eax, [eax] + mul dword [esi] + add eax, [edi] + mul dword [esp+8*limbs+44] + mov ebp, eax + mov esi, [esp+8*limbs+40] + + pxor mm0, mm0 + movd mm7, ebp + +%assign i 0 +%rep limbs + mseq i + %assign i i + 1 +%endrep + + movd ecx, mm0 + + add [edi+4*limbs], ecx + adc dword [edi+4*limbs+4], 0 + mov eax, [esp+8*limbs+32] + mov ebp, [eax] + mov esi, [esp+8*limbs+36] + + pxor mm0, mm0 + movd mm7, ebp + +%assign i 0 +%rep limbs + mseq i + %assign i i + 1 
+%endrep + + movd ecx, mm0 + add [edi+4*limbs], ecx + adc dword [edi+4*limbs+4], 0 + add dword [esp+8*limbs+32], 4 + add edi, 4 + dec dword [esp+8*limbs+4] + jnz .1 + + mov ebx, [esp+8*limbs+28] + +%assign i 0 +%rep limbs + mov eax, [edi+4*i] + mov [ebx+4*i], eax + %assign i i + 1 +%endrep + mov eax, [edi+4*limbs] + add esp, 8*(limbs+1) + + pop ebx + pop esi + pop edi + pop ebp + emms + ret +%endmacro + + bits 32 + section .text + + global _mulredc1 +%ifdef DLL + export _mulredc1 +%endif + +_mulredc1: + mov eax, [esp+12] + mul dword [esp+8] + mov [esp+12], edx + mov [esp+8], eax + mul dword [esp+20] + mul dword [esp+16] + add eax, [esp+8] + adc edx, [esp+12] + mov ecx, [esp+4] + mov [ecx], edx + adc eax, 0 + ret + +%assign i 2 +%rep 19 ; 3..20 inclusive + mulredc i + %assign i i + 1 +%endrep + + end + diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/a_win32p_redc.asm gmp-ecm-7.0.5+ds/build.vs/assembler/a_win32p_redc.asm --- gmp-ecm-7.0.4+ds/build.vs/assembler/a_win32p_redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/a_win32p_redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,145 @@ +; +; Part of GMP-ECM +; +; void ecm_redc3( +; mp_limb_t *z, rdi r8 <- rcx +; const mp_limb_t *x, rsi r9 <- rdx +; size_t n, rdx r10 <- r8 +; mp_limb_t m rcx r11 <- r9 +; ) + +%macro rloop 3 + mov eax, [byte esi+4*%3] + mul ebp + add [byte edi+4*%3], %2 + adc %1, eax + mov %2, edx + adc %2, 0 +%endmacro + + bits 32 + section .text + + global _ecm_redc3 +%ifdef DLL + export _ecm_redc3 +%endif + +_ecm_redc3: + push ebp + push edi + push esi + push ebx + sub esp, 16 + + mov ecx, [esp+44] + mov edi, [esp+36] + mov [esp], ecx + cmp ecx, 5 + jae .unroll + +.1: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + xor ebx, ebx + +.2: mov eax, [esi] + add edi, 4 + mul ebp + add esi, 4 + add eax, ebx + adc edx, 0 + add [edi-4], eax + adc edx, 0 + dec ecx + mov ebx, edx + jnz .2 + mov edi, [esp+36] + mov [edi], ebx + 
dec dword [esp] + lea edi, [edi+4] + jnz .1 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + +.unroll: + mov edx, ecx + dec ecx + sub edx, 2 + neg ecx + shr edx, 4 + and ecx, 15 + mov [esp+8], edx + mov edx, ecx + shl edx, 4 + neg ecx + lea edx, [edx+ecx*1+.loop_base] + mov [esp+44], ecx + mov [esp+12], edx + +.4: mov ebp, [esp+48] + mov esi, [esp+40] + imul ebp, [edi] + mov [esp+36], edi + mov ecx, [esp+44] + mov edx, [esp+8] + mov [esp+4], edx + mov eax, [esi] + lea esi, [esi+ecx*4+4] + mul ebp + lea edi, [edi+ecx*4] + mov ebx, edx + mov edx, [esp+12] + test ecx, 1 + mov ecx, eax + cmovnz ecx, ebx + cmovnz ebx, eax + jmp edx + + align 32 +.5: add edi, 64 +.loop_base: + rloop ebx, ecx, 0 + rloop ecx, ebx, 1 + rloop ebx, ecx, 2 + rloop ecx, ebx, 3 + rloop ebx, ecx, 4 + rloop ecx, ebx, 5 + rloop ebx, ecx, 6 + rloop ecx, ebx, 7 + rloop ebx, ecx, 8 + rloop ecx, ebx, 9 + rloop ebx, ecx, 10 + rloop ecx, ebx, 11 + rloop ebx, ecx, 12 + rloop ecx, ebx, 13 + rloop ebx, ecx, 14 + rloop ecx, ebx, 15 + + dec dword [esp+4] + lea esi, [esi+64] + jns .5 + + add [edi+64], ecx + mov edi, [esp+36] + adc ebx, 0 + mov [edi], ebx + dec dword [esp] + lea edi, [edi+4] + jnz .4 + + add esp, 16 + pop ebx + pop esi + pop edi + pop ebp + ret + + end diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/a_x64_mulredc.asm gmp-ecm-7.0.5+ds/build.vs/assembler/a_x64_mulredc.asm --- gmp-ecm-7.0.4+ds/build.vs/assembler/a_x64_mulredc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/a_x64_mulredc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,237 @@ +; +; Part of GMP-ECM +; +; mp_limb_t mulredc1( MSVC 1 limb +; mp_limb_t *z, rcx +; const mp_limb_t x, rdx +; const mp_limb_t y, r8 +; const mp_limb_t m, r9 +; mp_limb_t inv_m [rsp+0x28] +; ) +; +; mp_limb_t mulredc( MSVC > 1 limb +; mp_limb_t *z, rcx +; const mp_limb_t *x, rdx +; const mp_limb_t *y, r8 +; const mp_limb_t *m, r9 +; mp_limb_t inv_m [rsp+0x28] +; ) + +%macro mseq_1 4 + mov %2, rcx + mul r14 + add 
%1, rax + mov rax, [r9+8*%3] + adc %2, rdx + mul r11 +%if %3 < %4 - 1 + add rax, %1 + mov [rbp+8*(%3-1)], rax + mov rax, [r8+8*(%3+1)] + adc %2, rdx + setc cl +%else + add %1, rax + mov [rbp+8*(%3-1)], %1 + adc %2, rdx + mov [rbp+8*%3], %2 + setc cl + mov [rbp+8*(%3+1)], rcx +%endif +%endmacro + +%macro mseq_20 2 + mov r14, [r13+r12*8] + mov rax, [r8] + mov %1, [rbp] + mov %2, [rbp+8] + mul r14 + add r12, 1 + add rax, %1 + adc %2, rdx + setc cl + mov %1, rax + imul rax, r10 + mov r11, rax + mul qword [r9] + add %1, rax + adc %2, rdx + mov rax, [r8+8] +%endmacro + +%macro mseq_2 4 + mov %2, [rbp+8*(%3+1)] + adc %2, rcx +%if %3 < %4 - 1 + setc cl +%endif + mul r14 + add %1, rax + mov rax, [r9+8*%3] + adc %2, rdx +%if %3 < %4 - 1 + adc cl, 0 +%else + setc cl +%endif + mul r11 +%if %3 < %4 - 1 + add rax, %1 + mov [rbp+8*(%3-1)], rax + adc %2, rdx + mov rax, [r8+8*(%3+1)] +%else + add %1, rax + mov [rbp+8*(%3-1)], %1 + adc %2, rdx + mov [rbp+8*%3],%2 + adc cl, 0 + mov [rbp+8*(%3+1)], rcx +%endif +%endmacro + +%macro store 1 +%assign i 0 +%rep %1 + %if i == %1 - 1 && (%1 & 1) + mov rax, [rbp+8*i] + mov [rdi+8*i], rax + %elif (i & 1) + mov [rdi+8*(i-1)], rax + mov [rdi+8*i], rdx + %else + mov rax, [rbp+8*i] + mov rdx, [rbp+8*(i+1)] + %endif + %assign i i + 1 +%endrep +%endmacro + +%macro mulredc 1 + +%assign limbs %1 +%define f_name(x) mulredc %+ x +%define stack_space 8 * (limbs + 1 + (limbs & 1)) + + global f_name(limbs) +%ifdef DLL + export f_name(limbs) +%endif + + align 64 + +PROC_FRAME f_name(limbs) ; SEH Frame + push_reg rbp + push_reg rbx + push_reg rsi + push_reg rdi + push_reg r12 + push_reg r13 + push_reg r14 + alloc_stack stack_space +END_PROLOGUE + ; *y in r8 + mov rdi, rcx ; *z -> rdi + mov r13, rdx ; *x -> r13 + mov r10, [rsp+8*12+stack_space] ; invm -> r10 + ; *m in r9 + mov r14, [r13] + mov rax, [r8] + xor rcx, rcx + lea rbp, [rsp] + mov r12, rcx + mul qword r14 + add r12, 1 + mov rsi, rax + mov rbx, rdx + imul rax, r10 + mov r11, rax + mul qword [r9] + 
add rsi, rax + mov rax, [r8+8] + adc rbx, rdx + setc cl + +%assign j 1 +%rep limbs - 1 +%if (j & 1) + mseq_1 rbx, rsi, j, limbs +%else + mseq_1 rsi, rbx, j, limbs +%endif + %assign j j + 1 +%endrep + + align 32 +.1: + +%assign j 1 +%if (limbs & 1) + mseq_20 rsi, rbx + %rep limbs - 1 + %if (j & 1) + mseq_2 rbx, rsi, j, limbs + %else + mseq_2 rsi, rbx, j, limbs + %endif + %assign j j + 1 + %endrep +%else + mseq_20 rbx, rsi + %rep limbs - 1 + %if (j & 1) + mseq_2 rsi, rbx, j, limbs + %else + mseq_2 rbx, rsi, j, limbs + %endif + %assign j j + 1 + %endrep +%endif + + cmp r12, limbs + jb .1 + + store limbs + + mov rax, rcx + add rsp, stack_space + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbx + pop rbp + ret +ENDPROC_FRAME +%endmacro + + bits 64 + section .text + + global mulredc1 +%ifdef DLL + export mulredc1 +%endif + + align 64 +mulredc1: + mov rax, r8 + mul rdx + mov r10, rax + mov r11, rdx + mul qword [rsp+0x28] + mul r9 + add rax, r10 + adc rdx, r11 + mov [rcx], rdx + adc rax, 0 + ret + +%assign i 2 +%rep 19 ; 2..20 inclusive + mulredc i + %assign i i + 1 +%endrep + + end diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/a_x64_redc.asm gmp-ecm-7.0.5+ds/build.vs/assembler/a_x64_redc.asm --- gmp-ecm-7.0.4+ds/build.vs/assembler/a_x64_redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/a_x64_redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,161 @@ +; +; Part of GMP-ECM +; +; void ecm_redc3( +; mp_limb_t *z, rdi r8 <- rcx +; const mp_limb_t *x, rsi r9 <- rdx +; size_t n, rdx r10 <- r8 +; mp_limb_t m rcx r11 <- r9 +; ) + +%macro rloop 3 + mov rax,[byte rsi+8*%3] + mul rbp + add [byte rdi+8*%3], %1 + adc %2, rax + mov %1, rdx + adc %1, 0 +%endmacro + + bits 64 + section .text + + global ecm_redc3 +%ifdef DLL + export ecm_redc3 +%endif + +PROC_FRAME ecm_redc3 + push_reg rbp + push_reg rbx + push_reg rsi + push_reg rdi + alloc_stack 5*8 +END_PROLOGUE + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + + mov r8, rdi + 
mov r9, rsi + mov r10, rdx + mov r11, rcx + + mov rcx, r10 + mov [rsp], rcx + cmp rcx, 3 + jae .unroll + +.1: mov rbp, r11 + mov rsi, r9 + imul rbp, [rdi] + mov r8, rdi + mov rcx, r10 + xor rbx, rbx + +.2: mov rax, [rsi] + add rdi, 8 + mul rbp + add rsi, 8 + add rax, rbx + adc rdx, 0 + add [rdi-8], rax + adc rdx, 0 + dec rcx + mov rbx, rdx + jnz .2 + mov rdi, r8 + mov [rdi], rbx + dec qword [rsp] + lea rdi, [rdi+8] + jnz .1 + + add rsp, 5*8 + pop rdi + pop rsi + pop rbx + pop rbp + ret + +.unroll: + mov rdx, rcx + dec rcx + sub rdx, 2 + neg rcx + shr rdx, 4 + and rcx, 15 + mov [rsp+16], rdx + mov rdx, rcx + shl rdx, 4 + lea r10, [.loop_base wrt rip] + add rdx, r10 + lea rdx, [rdx+rcx*4] + add rdx, rcx + neg rcx + mov r10, rcx + mov [rsp+24], rdx + +.4: mov rbp, r11 + mov rsi, r9 + imul rbp, [rdi] + mov r8, rdi + mov rcx, r10 + mov rdx, [rsp+16] + mov [rsp+8], rdx + + mov rax, [rsi] + lea rsi, [rsi+rcx*8+8] + mul rbp + lea rdi, [rdi+rcx*8] + mov rbx, rdx + + mov rdx, [rsp+24] + test rcx, 1 + mov rcx, rax + cmovnz rcx, rbx + cmovnz rbx, rax + jmp rdx + + align 64 + +.5: add rdi, 128 +.loop_base: + rloop rcx, rbx, 0 + rloop rbx, rcx, 1 + rloop rcx, rbx, 2 + rloop rbx, rcx, 3 + rloop rcx, rbx, 4 + rloop rbx, rcx, 5 + rloop rcx, rbx, 6 + rloop rbx, rcx, 7 + rloop rcx, rbx, 8 + rloop rbx, rcx, 9 + rloop rcx, rbx, 10 + rloop rbx, rcx, 11 + rloop rcx, rbx, 12 + rloop rbx, rcx, 13 + rloop rcx, rbx, 14 + rloop rbx, rcx, 15 + + dec qword [rsp+8] + lea rsi, [rsi+128] + jns .5 + + add [rdi+128], rcx + mov rdi, r8 + adc rbx, 0 + mov [rdi], rbx + dec qword [rsp] + lea rdi, [rdi+8] + jnz .4 + + add rsp, 5*8 + pop rdi + pop rsi + pop rbx + pop rbp + ret +ENDPROC_FRAME + + end diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/Makefile.am gmp-ecm-7.0.5+ds/build.vs/assembler/Makefile.am --- gmp-ecm-7.0.4+ds/build.vs/assembler/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ +EXTRA_DIST = 
a_win32a_mulredc.asm a_win32a_redc.asm a_win32p_mulredc.asm \ + a_win32p_redc.asm a_x64_mulredc.asm a_x64_redc.asm \ + test_mulredc.c mulredc.h mulredc.asm redc.asm diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/mulredc.asm gmp-ecm-7.0.5+ds/build.vs/assembler/mulredc.asm --- gmp-ecm-7.0.4+ds/build.vs/assembler/mulredc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/mulredc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,8 @@ + +%ifdef _WIN64 +%include "a_x64_mulredc.asm" +%elifdef AMD_ASM +%include "a_win32a_mulredc.asm" +%else +%include "a_win32p_mulredc.asm" +%endif diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/mulredc.h gmp-ecm-7.0.5+ds/build.vs/assembler/mulredc.h --- gmp-ecm-7.0.4+ds/build.vs/assembler/mulredc.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/mulredc.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,32 @@ +#ifndef __ASM_REDC_H__ +#define __ASM_REDC_H__ + +#include + +extern void ecm_redc3(mp_limb_t *cp, const mp_limb_t *np, mp_size_t nn, mp_limb_t Nprim); + + +/* WARNING: the size-1 version doesn't take pointers in input */ +extern mp_limb_t mulredc1(mp_limb_t *z, mp_limb_t x, mp_limb_t y, mp_limb_t m, mp_limb_t inv_m); + +extern mp_limb_t mulredc2(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc3(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc4(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc5(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc6(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc7(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc8(mp_limb_t *z, const mp_limb_t *x, 
const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc9(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc10(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc11(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc12(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc13(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc14(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc15(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc16(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc17(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc18(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc19(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); +extern mp_limb_t mulredc20(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); + +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/redc.asm gmp-ecm-7.0.5+ds/build.vs/assembler/redc.asm --- gmp-ecm-7.0.4+ds/build.vs/assembler/redc.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/redc.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,8 @@ +; Pick the redc source by defined symbol: _WIN64 -> a_x64_redc.asm, AMD_ASM -> a_win32a_redc.asm, otherwise a_win32p_redc.asm. AMD_ASM is tested for definedness (elifdef), not evaluated as an expression. +%ifdef _WIN64 +%include "a_x64_redc.asm" +%elifdef AMD_ASM +%include "a_win32a_redc.asm" +%else +%include "a_win32p_redc.asm" +%endif diff -Nru gmp-ecm-7.0.4+ds/build.vs/assembler/test_mulredc.c
gmp-ecm-7.0.5+ds/build.vs/assembler/test_mulredc.c --- gmp-ecm-7.0.4+ds/build.vs/assembler/test_mulredc.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/assembler/test_mulredc.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,304 @@ +#include +#include +#include + +#include + +#include "asmredc.h" + +void mp_print(mp_limb_t *x, int N) { + int i; + for (i = 0; i < N-1; ++i) + printf("%lu + W*(", x[i]); + printf("%lu", x[N-1]); + for (i = 0; i < N-1; ++i) + printf(")"); + printf("\n"); +} + +static mp_limb_t +call_mulredc (int N, mp_limb_t *z, mp_limb_t *x, mp_limb_t *y, mp_limb_t *m, + mp_limb_t invm) +{ + mp_limb_t cy; + + switch (N) + { + case 1: + cy = mulredc1(z, x[0], y[0], m[0], invm); + break; + case 2: + cy = mulredc2(z, x, y, m, invm); + break; + case 3: + cy = mulredc3(z, x, y, m, invm); + break; + case 4: + cy = mulredc4(z, x, y, m, invm); + break; + case 5: + cy = mulredc5(z, x, y, m, invm); + break; + case 6: + cy = mulredc6(z, x, y, m, invm); + break; + case 7: + cy = mulredc7(z, x, y, m, invm); + break; + case 8: + cy = mulredc8(z, x, y, m, invm); + break; + case 9: + cy = mulredc9(z, x, y, m, invm); + break; + case 10: + cy = mulredc10(z, x, y, m, invm); + break; + case 11: + cy = mulredc11(z, x, y, m, invm); + break; + case 12: + cy = mulredc12(z, x, y, m, invm); + break; + case 13: + cy = mulredc13(z, x, y, m, invm); + break; + case 14: + cy = mulredc14(z, x, y, m, invm); + break; + case 15: + cy = mulredc15(z, x, y, m, invm); + break; + case 16: + cy = mulredc16(z, x, y, m, invm); + break; + case 17: + cy = mulredc17(z, x, y, m, invm); + break; + case 18: + cy = mulredc18(z, x, y, m, invm); + break; + case 19: + cy = mulredc19(z, x, y, m, invm); + break; + case 20: + cy = mulredc20(z, x, y, m, invm); + break; + default: + cy = mulredc20(z, x, y, m, invm); + } + return cy; +} + +/* Run k rounds on N-limb operands: compare call_mulredc against mpn_mul_n followed by ecm_redc3, then against a plain mpn_tdiv_qr reduction; any mismatch trips an assert. */ +void test(mp_size_t N, int k) +{ + mp_limb_t *x, *y, *yp, *z, *m, invm, cy, cy2, *tmp, *tmp2, *tmp3; + int i, j; + + x = (mp_limb_t *)
malloc(N*sizeof(mp_limb_t)); + y = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); + z = (mp_limb_t *) malloc((N+1)*sizeof(mp_limb_t)); + m = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); + tmp = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); + tmp2 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); + tmp3 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); + + if (x == NULL || y == NULL || z == NULL || m == NULL || tmp == NULL || + tmp2 == NULL || tmp3 == NULL) + { + fprintf (stderr, "Cannot allocate memory in test_mulredc\n"); + exit (1); + } + + mpn_random2(m, N); + m[0] |= 1UL; + if (m[N-1] == 0) + m[N-1] = 1UL; + + invm = 1UL; + for (i = 0; i < 10; ++i) + invm = (2*invm-m[0]*invm*invm); + invm = -invm; + + assert( (invm*m[0] +1UL) == 0UL); + + for (i=0; i < k; ++i) { + yp = y; /* second operand defaults to y every round; only the squaring case below aliases it to x */ + /* Try a few special cases */ + if (i == 0) + { + /* Try all 0, product should be 0 */ + for (j = 0; j < N; j++) + x[j] = y[j] = 0; + } + else if (i == 1) + { + /* Try all 1 */ + for (j = 0; j < N; j++) + x[j] = y[j] = 1; + } + else if (i == 2) + { + /* Try all 2^wordsize - 1 */ + for (j = 0; j < N; j++) + x[j] = y[j] = ~(mp_limb_t) 0; /* complement at limb width; unsigned long may be narrower than a limb */ + } + else + { + /* In the other cases, try random data */ + if (i % 2 == 0) + { + /* Try squaring */ + mpn_random2(x, N); + yp = x; + } + else + { + /* Try multiplication */ + mpn_random2(x, N); + mpn_random2(y, N); + } + } + + // Mul followed by ecm_redc3 + mpn_mul_n(tmp, x, yp, N); + ecm_redc3(tmp, m, N, invm); + cy2 = mpn_add_n (tmp2, tmp + N, tmp, N); + + // Mixed mul and redc + cy = call_mulredc (N, z, x, yp, m, invm); + + if (cy != cy2) + printf ("i = %d: mulredc cy = %ld, mpn_mul_n/ecm_redc3 cy = %ld\n", + i, (long) cy, (long) cy2); + assert (cy == cy2); + if (mpn_cmp(z,tmp2, N) != 0) + { + printf ("i = %d\nmulredc = ", i); + for (j = N - 1; j >= 0; j--) + printf ("%lx ", z[j]); + printf ("\nmpn_mul_n/ecm_redc3 = "); + for (j = N - 1; j >= 0; j--) + printf ("%lx ", tmp2[j]); + printf ("\n"); + assert (mpn_cmp(z,tmp2, N) == 0); + } + + if (cy) + printf("!"); +
z[N] = cy; + // Check with pure gmp : multiply by 2^(N*GMP_NUMB_BITS) and compare. + for (j=0; j < N; ++j) { + tmp[j] = 0; + tmp[j+N] = z[j]; + } + tmp[2*N] = z[N]; + mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N+1, m, N); + for (j=0; j < N; ++j) + z[j] = tmp3[j]; + + mpn_mul_n(tmp, x, yp, N); + mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N, m, N); + + assert(mpn_cmp(z, tmp3, N) == 0); + } + + free(tmp); free(tmp2); free(tmp3); + free(x); free(y); free(z); free(m); +} + + + +int main(int argc, char** argv) +{ + int i, len; + + if (argc > 1) /* Test a specific length */ + { + len = atoi (argv[1]); + for (i = 0; i < 1; i++) + test (len, 1000000); + return 0; + } + + for (;;) { + for (i = 1; i <= 20; ++i) { + test(i, 1000); + } +#if 0 + test(1, 1000); + test(2, 1000); + test(3, 1000); + test(4, 1000); + test(5, 1000); + test(6, 1000); + test(7, 1000); + test(8, 1000); + test(9, 1000); + test(10, 1000); + test(11, 1000); + test(12, 1000); + test(13, 100); + test(14, 100); + test(15, 100); + test(16, 100); + test(17, 100); + test(18, 100); + test(44, 10); + test(45, 10); + test(46, 10); + test(47, 10); + test(48, 10); + test(49, 10); +#endif + printf("."); fflush(stdout); + } +#if 0 + x[0] = 12580274668139321508UL; + x[1] = 9205793975152560417UL; + x[2] = 7857372727033793057UL; + + y[0] = 13688385828267279103UL; + y[1] = 10575011835742767258UL; + y[2] = 8802048318027595690UL; + + + m[0] = 2981542467342508025UL; + m[1] = 5964669706257742025UL; + m[2] = 18446744073678090270UL; + + invm = 9419286575570128311UL; + + carry = mulredc(z, x, y, m, 3, invm); + + printf("%lu + 2^64*(%lu + 2^64*%lu), carry=%lu\n", z[0], z[1], z[2], carry); +#endif + return 0; +} + + +#if 0 + +W := 2^64; + +x0:= 12580274668139321508; +x1:= 9205793975152560417; +x2:= 7857372727033793057; +x := x0 + W*(x1 + W*x2); + +y0:= 13688385828267279103; +y1:= 10575011835742767258; +y2:= 8802048318027595690; +y := y0 + W*(y1 + W*y2); + +m0:= 2981542467342508025; +m1:= 5964669706257742025; +m2:= 18446744073678090270; +m := m0 +
W*(m1 + W*m2); + +invm := 9419286575570128311; + + + +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vs/bench_mulredc/bench_mulredc.vcxproj gmp-ecm-7.0.5+ds/build.vs/bench_mulredc/bench_mulredc.vcxproj --- gmp-ecm-7.0.4+ds/build.vs/bench_mulredc/bench_mulredc.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/bench_mulredc/bench_mulredc.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,171 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {4727DE12-787D-432D-B166-BF103B0C3C87} + Win32Proj + bench_mulredc + 10.0 + + + + Application + true + v142 + + + Application + true + v142 + + + Application + false + true + v142 + + + Application + false + true + v142 + + + + + + + + + + + + + + + + + + + + + + + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\ + MultiThreadedDebug + + + Console + true + psapi.lib;..\..\..\$(mp_dir)$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + + + Level3 + Disabled + _WIN64;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\ + MultiThreadedDebug + + + Console + true + psapi.lib;..\..\..\$(mp_dir)$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\ + MultiThreaded + + + Console + true + true + true +
psapi.lib;..\..\..\$(mp_dir)$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + _WIN64;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\ + MultiThreaded + + + Console + true + true + true + psapi.lib;..\..\..\$(mp_dir)$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/bench_mulredc/bench_mulredc.vcxproj.filters gmp-ecm-7.0.5+ds/build.vs/bench_mulredc/bench_mulredc.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vs/bench_mulredc/bench_mulredc.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/bench_mulredc/bench_mulredc.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,23 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/bench_mulredc/Makefile.am gmp-ecm-7.0.5+ds/build.vs/bench_mulredc/Makefile.am --- gmp-ecm-7.0.4+ds/build.vs/bench_mulredc/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/bench_mulredc/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = bench_mulredc.vcxproj bench_mulredc.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vs/config.h gmp-ecm-7.0.5+ds/build.vs/config.h --- gmp-ecm-7.0.4+ds/build.vs/config.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/config.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,247 @@ +/* config.h.in. Generated from configure.in by autoheader. */ + +#define VERSION ECM_VERSION + +#define VERSION_GPU "gpu_ecm-win" + +#define PACKAGE_BUGREPORT "ecm-discuss@inria.fr" + +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. 
This function is required for `alloca.c' support on those systems. + */ +#undef CRAY_STACKSEG_END + +/* Define to 1 if using `alloca.c'. */ +#define C_ALLOCA 1 + +/* Define to 1 if you have the `access' function. */ +#undef HAVE_ACCESS + +/* Define to 1 if you have `alloca', as a function or macro. */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#undef HAVE_ALLOCA_H + +/* Define to 1 if you have the `ctime' function. */ +#define HAVE_CTIME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_CTYPE_H 1 + +/* Define to 1 if you have the `floor' function. */ +#define HAVE_FLOOR 1 + +/* Define to 1 if you have the `fmod' function. */ +#define HAVE_FMOD 1 + +/* Define to 1 if you have the `gethostname' function. */ +#define HAVE_GETHOSTNAME 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `gettimeofday' function. */ +#undef HAVE_GETTIMEOFDAY + +/* Define to 1 if you have the header file. */ +#define HAVE_GMP_H 1 + +/* Define to 1 if gwnum.a or gwnum.lib exist */ +#undef HAVE_GWNUM + +/* Define to 1 if you have the header file. */ +#undef HAVE_INTTYPES_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_IO_H + +/* Define to 1 if you have the `isascii' function. */ +#undef HAVE_ISASCII + +/* Define to 1 if you have the `isdigit' function. */ +#define HAVE_ISDIGIT 1 + +/* Define to 1 if you have the `isspace' function. */ +#define HAVE_ISSPACE 1 + +/* Define to 1 if you have the `isxdigit' function. */ +#define HAVE_ISXDIGIT 1 + +/* Define to 1 if you have the `m' library (-lm). */ +#undef HAVE_LIBM + +/* Define to 1 if you have the header file. */ +#define HAVE_LIMITS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MALLOC_H 1 + +/* Define to 1 if you have the `malloc_usable_size' function. */ +#undef HAVE_MALLOC_USABLE_SIZE + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_MATH_H 1 + +/* Define to 1 if you have the `memmove' function. */ +#define HAVE_MEMMOVE 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `nice' function. */ +#undef HAVE_NICE + +/* Define to 1 if you have the `pow' function. */ +#define HAVE_POW 1 + +/* Define to 1 if you have the `signal' function. */ +#define HAVE_SIGNAL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SIGNAL_H 1 + +/* Define to 1 if you have the `sqrt' function. */ +#define HAVE_SQRT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strlen' function. */ +#define HAVE_STRLEN 1 + +/* Define to 1 if you have the `strncasecmp' function. */ +#undef HAVE_STRNCASECMP + +/* Define to 1 if you have the `strstr' function. */ +#undef HAVE_STRSTR + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_RESOURCE_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TIME_H + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `time' function. */ +#undef HAVE_TIME + +/* Define to 1 if you have the header file. */ +#undef HAVE_UNISTD_H + +/* Define to 1 if you have the `unlink' function. */ +#define HAVE_UNLINK 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_WINDOWS_H 1 + +/* Define to 1 if you have the `__gmpn_add_nc' function. 
*/ +#if defined( _WIN64 ) +# define HAVE___GMPN_ADD_NC 1 +#endif + +/* Define to 1 if you have the `__gmpn_mod_34lsub1' function. */ +#define HAVE___GMPN_MOD_34LSUB1 1 + +/* Define to 1 if you have the `__gmpn_mul_fft' function. */ +#define HAVE___GMPN_MUL_FFT 1 + +/* Define to 1 if you want memory debugging */ +#undef MEMORY_DEBUG + +/* Define if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 +#define HAVE_LONG_LONG_INT 1 + +/* Define to 1 to use asm redc on x86 or x86_64 */ +# define NATIVE_REDC 1 + +/* Define to 1 if your C compiler doesn't accept -c and -o together. */ +#undef NO_MINUS_C_MINUS_O + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at runtime. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +#undef STACK_DIRECTION + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#undef TIME_WITH_SYS_TIME + +/* Define to 1 if you want assertions enabled */ +#undef WANT_ASSERT + +/* Define to 1 if you want shell command execution */ +#undef WANT_SHELLCMD + +/* Define to empty if `const' does not conform to ANSI C. */ +#undef const + +/* How to specify hot-spot attribute, if available */ +#define ATTRIBUTE_HOT + +#define HAVE___GMPN_REDC_1 1 + +#define HAVE___GMPN_REDC_2 1 + +#define HAVE_ASM_REDC3 1 + +#define WINDOWS64_ABI 1 + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +#define inline __inline +#endif + +/* Define to `unsigned int' if does not define. 
*/ +#undef size_t + +#define PRIdSIZE "Id" +#define PRIuSIZE "Iu" + +#ifdef _MSC_VER + +#define __func__ __FUNCTION__ + +/* define Windows tuning here */ +# define __tune_corei7__ + +# if _MSC_VER < 1600 +# define int64_t __int64 +# define uint64_t unsigned __int64 +# endif +#define strncasecmp _strnicmp +#define strcasecmp _stricmp +# define access _access +# define alloca _alloca +# define fseek64 _fseek64 +# define ftell64 _ftell64 +# define omp_get_thread_limit omp_get_max_threads +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vs/ecm/ecm.vcxproj gmp-ecm-7.0.5+ds/build.vs/ecm/ecm.vcxproj --- gmp-ecm-7.0.4+ds/build.vs/ecm/ecm.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/ecm/ecm.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,238 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} + ecm + Win32Proj + 10.0 + + + + Application + v142 + + + Application + v142 + + + Application + v142 + + + Application + v142 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + + Full + true + Speed + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + false + Console + true + true + false + + + MachineX86 + + + + + X64 + + + Full + true + Speed + ..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + 
WIN32;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + + + Level3 + ProgramDatabase + Default + true + + + ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) + Console + true + true + false + + + MachineX64 + 8388608 + 65536 + + + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX86 + + + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX64 + 8388608 + 65536 + + + + + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + {cd555681-d65b-4173-a29c-b8bf06a4871b} + false + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/ecm/ecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vs/ecm/ecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vs/ecm/ecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/ecm/ecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,74 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source 
Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/ecm/Makefile.am gmp-ecm-7.0.5+ds/build.vs/ecm/Makefile.am --- gmp-ecm-7.0.4+ds/build.vs/ecm/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/ecm/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = ecm.vcxproj ecm.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vs/ecm_gpu/ecm_gpu.vcxproj gmp-ecm-7.0.5+ds/build.vs/ecm_gpu/ecm_gpu.vcxproj --- gmp-ecm-7.0.4+ds/build.vs/ecm_gpu/ecm_gpu.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/ecm_gpu/ecm_gpu.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,280 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {1B353D8B-9808-4EB3-A5E7-075D751757AD} + ecm_gpu + Win32Proj + 10.0 + + + + Application + v142 + + + Application + v142 + + + Application + v142 + + + Application + v142 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + + Full + true + Speed + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v11.5\lib\$(Platform)\cudart.lib + false + Console + true + true + false + + + NotSet + + + compute_50,sm_50 + + + 32 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + X64 + + + Full + true + Speed + ..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + + + Level3 + ProgramDatabase + Default + true + + + ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) + Console + true + true + false + + + NotSet + 8388608 + 65536 + + + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + + + Level3 + EditAndContinue + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\lib\$(Platform)\cudart.lib + true + Console + false + + + NotSet + + + compute_50,sm_50 + + + 32 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0\lib\$(Platform)\cudart.lib + true + Console + false + + + NotSet + 8388608 + 65536 + + + compute_50,sm_50 + + + 64 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);..\..\lib\$(IntDir)libecm_gpu.lib;advapi32.lib;ws2_32.lib;C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v11.5\lib\$(Platform)\cudart.lib + true + + + compute_50,sm_50 + + + 64 + ..\;..\..\..\mpir\lib\$(IntDir) + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + true + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/ecm_gpu/ecm_gpu.vcxproj.filters gmp-ecm-7.0.5+ds/build.vs/ecm_gpu/ecm_gpu.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vs/ecm_gpu/ecm_gpu.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/ecm_gpu/ecm_gpu.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,78 @@ + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + {2a13feaf-0c0e-469a-8047-82c647322da9} + + + {163547c7-89d7-4ddc-b0ad-02b4cfd722b4} + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/ecm_gpu/Makefile.am gmp-ecm-7.0.5+ds/build.vs/ecm_gpu/Makefile.am --- gmp-ecm-7.0.4+ds/build.vs/ecm_gpu/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/ecm_gpu/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = ecm_gpu.vcxproj ecm_gpu.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vs/ecm_gpu.sln gmp-ecm-7.0.5+ds/build.vs/ecm_gpu.sln --- gmp-ecm-7.0.4+ds/build.vs/ecm_gpu.sln 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/ecm_gpu.sln 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,39 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libecm_gpu", "libecm_gpu\libecm_gpu.vcxproj", "{3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}" +EndProject 
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ecm_gpu", "ecm_gpu\ecm_gpu.vcxproj", "{1B353D8B-9808-4EB3-A5E7-075D751757AD}" + ProjectSection(ProjectDependencies) = postProject + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00} = {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00} + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|Win32.ActiveCfg = Debug|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|Win32.Build.0 = Debug|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|x64.ActiveCfg = Debug|x64 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Debug|x64.Build.0 = Debug|x64 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|Win32.ActiveCfg = Release|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|Win32.Build.0 = Release|Win32 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|x64.ActiveCfg = Release|x64 + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00}.Release|x64.Build.0 = Release|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|Win32.ActiveCfg = Debug|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|Win32.Build.0 = Debug|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|x64.ActiveCfg = Debug|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Debug|x64.Build.0 = Debug|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|Win32.ActiveCfg = Release|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|Win32.Build.0 = Release|Win32 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|x64.ActiveCfg = Release|x64 + {1B353D8B-9808-4EB3-A5E7-075D751757AD}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff -Nru gmp-ecm-7.0.4+ds/build.vs/ecm.sln gmp-ecm-7.0.5+ds/build.vs/ecm.sln --- 
gmp-ecm-7.0.4+ds/build.vs/ecm.sln 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/ecm.sln 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,70 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.24720.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libecm", "libecm\libecm.vcxproj", "{CD555681-D65B-4173-A29C-B8BF06A4871B}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ecm", "ecm\ecm.vcxproj", "{C0E2EA85-996A-4B5F-AD30-590FAF5B7187}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tune", "tune\tune.vcxproj", "{80E08750-5C6C-492E-BB1E-7200978AE125}" + ProjectSection(ProjectDependencies) = postProject + {CD555681-D65B-4173-A29C-B8BF06A4871B} = {CD555681-D65B-4173-A29C-B8BF06A4871B} + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} = {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bench_mulredc", "bench_mulredc\bench_mulredc.vcxproj", "{4727DE12-787D-432D-B166-BF103B0C3C87}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "multiecm", "multiecm\multiecm.vcxproj", "{16434DC2-371C-451B-A336-820499B98B8C}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|Win32.ActiveCfg = Debug|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|Win32.Build.0 = Debug|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|x64.ActiveCfg = Debug|x64 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|x64.Build.0 = Debug|x64 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|Win32.ActiveCfg = Release|Win32 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|Win32.Build.0 = Release|Win32 + 
{CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|x64.ActiveCfg = Release|x64 + {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|x64.Build.0 = Release|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|Win32.ActiveCfg = Debug|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|Win32.Build.0 = Debug|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|x64.ActiveCfg = Debug|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|x64.Build.0 = Debug|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|Win32.ActiveCfg = Release|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|Win32.Build.0 = Release|Win32 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|x64.ActiveCfg = Release|x64 + {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|x64.Build.0 = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|Win32.ActiveCfg = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|x64.ActiveCfg = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|x64.Build.0 = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|Win32.ActiveCfg = Release|Win32 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|Win32.Build.0 = Release|Win32 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|x64.ActiveCfg = Release|x64 + {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|x64.Build.0 = Release|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|Win32.ActiveCfg = Debug|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|Win32.Build.0 = Debug|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|x64.ActiveCfg = Debug|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|x64.Build.0 = Debug|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|Win32.ActiveCfg = Release|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|Win32.Build.0 = Release|Win32 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|x64.ActiveCfg = Release|x64 + {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|x64.Build.0 = Release|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|Win32.ActiveCfg = Debug|Win32 + 
{16434DC2-371C-451B-A336-820499B98B8C}.Debug|Win32.Build.0 = Debug|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|x64.ActiveCfg = Debug|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Debug|x64.Build.0 = Debug|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|Win32.ActiveCfg = Release|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|Win32.Build.0 = Release|Win32 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|x64.ActiveCfg = Release|x64 + {16434DC2-371C-451B-A336-820499B98B8C}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff -Nru gmp-ecm-7.0.4+ds/build.vs/file_copy.bat gmp-ecm-7.0.5+ds/build.vs/file_copy.bat --- gmp-ecm-7.0.4+ds/build.vs/file_copy.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/file_copy.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,4 @@ +if not exist %1 ( echo file_copy failure: %1 not found && goto exit ) +if exist %2 ( fc %1 %2 > nul && if not %errorlevel 1 goto exit ) +echo copying %1 to %2 && copy %1 %2 +:exit diff -Nru gmp-ecm-7.0.4+ds/build.vs/gen_ecm_h.bat gmp-ecm-7.0.5+ds/build.vs/gen_ecm_h.bat --- gmp-ecm-7.0.4+ds/build.vs/gen_ecm_h.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/gen_ecm_h.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,13 @@ +@echo off +echo creating ecm.h from ecm.h.in +echo /* generated from ecm-h.in by gen_ecm_h.bat */>tmp.h + +for /f "tokens=1,2*" %%a in (..\ecm.h.in) do ( + if "%%a" EQU "#undef" ( + if "%%b" EQU "ECM_VERSION" ( + echo #define ECM_VERSION "7.0.5-dev">>tmp.h + ) + ) else echo %%a %%b %%c>>tmp.h +) + +call out_copy_rename tmp.h ..\ ecm.h diff -Nru gmp-ecm-7.0.4+ds/build.vs/getopt.c gmp-ecm-7.0.5+ds/build.vs/getopt.c --- gmp-ecm-7.0.4+ds/build.vs/getopt.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/getopt.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,1281 @@ +/* Getopt for GNU. 
+ NOTE: getopt is now part of the C library, so if you don't know what + "Keep this file name-space clean" means, talk to drepper@gnu.org + before changing it! + Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001,2002 + Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>. + Ditto for AIX 3.2 and <stdlib.h>. */ + +#define HAVE_STRING_H 1 + +#ifndef _NO_PROTO +# define _NO_PROTO +#endif + +#ifdef HAVE_CONFIG_H +# include +#endif + +#if !defined __STDC__ || !__STDC__ +/* This is a separate conditional since some stdc systems + reject `defined (const)'. */ +# ifndef const +# define const +# endif +#endif + +#include + +/* Comment out all this code if we are using the GNU C Library, and are not + actually compiling the library itself. This code is part of the GNU C + Library, but also included in many other GNU distributions. Compiling + and linking in this code is a waste when using the GNU C library + (especially if it is a shared library). Rather than having every GNU + program understand `configure --with-gnu-libc' and omit the object files, + it is simpler to just do this in the source for each such file.
*/ + +#define GETOPT_INTERFACE_VERSION 2 +#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 +# include +# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION +# define ELIDE_CODE +# endif +#endif + +#ifndef ELIDE_CODE + + +/* This needs to come after some library #include + to get __GNU_LIBRARY__ defined. */ +#ifdef __GNU_LIBRARY__ +/* Don't include stdlib.h for non-GNU C libraries because some of them + contain conflicting prototypes for getopt. */ +# include +# include +#endif /* GNU C library. */ + +#ifdef VMS +# include +# if HAVE_STRING_H - 0 +# include +# endif +#endif + +#ifndef _ +/* This is for other GNU distributions with internationalized messages. */ +# if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC +# include +# ifndef _ +# define _(msgid) gettext (msgid) +# endif +# else +# define _(msgid) (msgid) +# endif +# if defined _LIBC && defined USE_IN_LIBIO +# include +# endif +#endif + +#ifndef attribute_hidden +# define attribute_hidden +#endif + +/* This version of `getopt' appears to the caller like standard Unix `getopt' + but it behaves differently for the user, since it allows the user + to intersperse the options with the other arguments. + + As `getopt' works, it permutes the elements of ARGV so that, + when it is done, all the options precede everything else. Thus + all application programs are extended to handle flexible argument order. + + Setting the environment variable POSIXLY_CORRECT disables permutation. + Then the behavior is completely standard. + + GNU application programs can use a third alternative mode in which + they can distinguish the relative order of options and other arguments. */ + +#include "getopt.h" + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. 
*/ + +char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +/* 1003.2 says this must be 1 before any call. */ +int optind = 1; + +/* Formerly, initialization of getopt depended on optind==0, which + causes problems with re-calling getopt as programs generally don't + know that. */ + +int __getopt_initialized attribute_hidden; + +/* The next char to be scanned in the option-element + in which the last option character we returned was found. + This allows us to pick up the scan where we left off. + + If this is zero, or a null string, it means resume the scan + by advancing to the next ARGV-element. */ + +static char *nextchar; + +/* Callers store zero here to inhibit the error message + for unrecognized options. */ + +int opterr = 1; + +/* Set to an option character which was unrecognized. + This must be initialized on some systems to avoid linking in the + system's own getopt implementation. */ + +int optopt = '?'; + +/* Describe how to deal with options that follow non-option ARGV-elements. + + If the caller did not specify anything, + the default is REQUIRE_ORDER if the environment variable + POSIXLY_CORRECT is defined, PERMUTE otherwise. + + REQUIRE_ORDER means don't recognize them as options; + stop option processing when the first non-option is seen. + This is what Unix does. + This mode of operation is selected by either setting the environment + variable POSIXLY_CORRECT, or using `+' as the first character + of the list of option characters. + + PERMUTE is the default. 
We permute the contents of ARGV as we scan, + so that eventually all the non-options are at the end. This allows options + to be given in any order, even with programs that were not written to + expect this. + + RETURN_IN_ORDER is an option available to programs that were written + to expect options and other ARGV-elements in any order and that care about + the ordering of the two. We describe each non-option ARGV-element + as if it were the argument of an option with character code 1. + Using `-' as the first character of the list of option characters + selects this mode of operation. + + The special argument `--' forces an end of option-scanning regardless + of the value of `ordering'. In the case of RETURN_IN_ORDER, only + `--' can cause `getopt' to return -1 with `optind' != ARGC. */ + +static enum +{ + REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER +} ordering; + +/* Value of POSIXLY_CORRECT environment variable. */ +static char *posixly_correct; + +#ifdef __GNU_LIBRARY__ +/* We want to avoid inclusion of string.h with non-GNU libraries + because there are many ways it can cause trouble. + On some systems, it contains special magic macros that don't work + in GCC. */ +# include +# define my_index strchr +#else + +# if HAVE_STRING_H +# include +# else +# include +# endif + +/* Avoid depending on library functions or files + whose names are inconsistent. */ + +#ifndef getenv +extern char *getenv (); +#endif + +static char * +my_index (str, chr) + const char *str; + int chr; +{ + while (*str) + { + if (*str == chr) + return (char *) str; + str++; + } + return 0; +} + +/* If using GCC, we can safely declare strlen this way. + If not using GCC, it is ok not to declare it. */ +#ifdef __GNUC__ +/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. + That was relevant to code that was here before. 
*/ +# if (!defined __STDC__ || !__STDC__) && !defined strlen +/* gcc with -traditional declares the built-in strlen to return int, + and has done so at least since version 2.4.5. -- rms. */ +extern int strlen (const char *); +# endif /* not __STDC__ */ +#endif /* __GNUC__ */ + +#endif /* not __GNU_LIBRARY__ */ + +/* Handle permutation of arguments. */ + +/* Describe the part of ARGV that contains non-options that have + been skipped. `first_nonopt' is the index in ARGV of the first of them; + `last_nonopt' is the index after the last of them. */ + +static int first_nonopt; +static int last_nonopt; + +#ifdef _LIBC +/* Stored original parameters. + XXX This is no good solution. We should rather copy the args so + that we can compare them later. But we must not use malloc(3). */ +extern int __libc_argc; +extern char **__libc_argv; + +/* Bash 2.0 gives us an environment variable containing flags + indicating ARGV elements that should not be considered arguments. */ + +# ifdef USE_NONOPTION_FLAGS +/* Defined in getopt_init.c */ +extern char *__getopt_nonoption_flags; + +static int nonoption_flags_max_len; +static int nonoption_flags_len; +# endif + +# ifdef USE_NONOPTION_FLAGS +# define SWAP_FLAGS(ch1, ch2) \ + if (nonoption_flags_len > 0) \ + { \ + char __tmp = __getopt_nonoption_flags[ch1]; \ + __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ + __getopt_nonoption_flags[ch2] = __tmp; \ + } +# else +# define SWAP_FLAGS(ch1, ch2) +# endif +#else /* !_LIBC */ +# define SWAP_FLAGS(ch1, ch2) +#endif /* _LIBC */ + +/* Exchange two adjacent subsequences of ARGV. + One subsequence is elements [first_nonopt,last_nonopt) + which contains all the non-options that have been skipped so far. + The other is elements [last_nonopt,optind), which contains all + the options processed since those non-options were skipped. + + `first_nonopt' and `last_nonopt' are relocated so that they describe + the new indices of the non-options in ARGV after they are moved. 
*/ + +#if defined __STDC__ && __STDC__ +static void exchange (char **); +#endif + +static void +exchange (argv) + char **argv; +{ + int bottom = first_nonopt; + int middle = last_nonopt; + int top = optind; + char *tem; + + /* Exchange the shorter segment with the far end of the longer segment. + That puts the shorter segment into the right place. + It leaves the longer segment in the right place overall, + but it consists of two parts that need to be swapped next. */ + +#if defined _LIBC && defined USE_NONOPTION_FLAGS + /* First make sure the handling of the `__getopt_nonoption_flags' + string can work normally. Our top argument must be in the range + of the string. */ + if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) + { + /* We must extend the array. The user plays games with us and + presents new arguments. */ + char *new_str = malloc (top + 1); + if (new_str == NULL) + nonoption_flags_len = nonoption_flags_max_len = 0; + else + { + memset (__mempcpy (new_str, __getopt_nonoption_flags, + nonoption_flags_max_len), + '\0', top + 1 - nonoption_flags_max_len); + nonoption_flags_max_len = top + 1; + __getopt_nonoption_flags = new_str; + } + } +#endif + + while (top > middle && middle > bottom) + { + if (top - middle > middle - bottom) + { + /* Bottom segment is the short one. */ + int len = middle - bottom; + register int i; + + /* Swap it with the top part of the top segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[top - (middle - bottom) + i]; + argv[top - (middle - bottom) + i] = tem; + SWAP_FLAGS (bottom + i, top - (middle - bottom) + i); + } + /* Exclude the moved bottom segment from further swapping. */ + top -= len; + } + else + { + /* Top segment is the short one. */ + int len = top - middle; + register int i; + + /* Swap it with the bottom part of the bottom segment. 
*/ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[middle + i]; + argv[middle + i] = tem; + SWAP_FLAGS (bottom + i, middle + i); + } + /* Exclude the moved top segment from further swapping. */ + bottom += len; + } + } + + /* Update records for the slots the non-options now occupy. */ + + first_nonopt += (optind - last_nonopt); + last_nonopt = optind; +} + +/* Initialize the internal data when the first call is made. */ + +#if defined __STDC__ && __STDC__ +static const char *_getopt_initialize (int, char *const *, const char *); +#endif +static const char * +_getopt_initialize (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + /* Start processing options with ARGV-element 1 (since ARGV-element 0 + is the program name); the sequence of previously skipped + non-option ARGV-elements is empty. */ + + first_nonopt = last_nonopt = optind; + + nextchar = NULL; + + posixly_correct = getenv ("POSIXLY_CORRECT"); + + /* Determine how to handle the ordering of options and nonoptions. 
*/ + + if (optstring[0] == '-') + { + ordering = RETURN_IN_ORDER; + ++optstring; + } + else if (optstring[0] == '+') + { + ordering = REQUIRE_ORDER; + ++optstring; + } + else if (posixly_correct != NULL) + ordering = REQUIRE_ORDER; + else + ordering = PERMUTE; + +#if defined _LIBC && defined USE_NONOPTION_FLAGS + if (posixly_correct == NULL + && argc == __libc_argc && argv == __libc_argv) + { + if (nonoption_flags_max_len == 0) + { + if (__getopt_nonoption_flags == NULL + || __getopt_nonoption_flags[0] == '\0') + nonoption_flags_max_len = -1; + else + { + const char *orig_str = __getopt_nonoption_flags; + int len = nonoption_flags_max_len = strlen (orig_str); + if (nonoption_flags_max_len < argc) + nonoption_flags_max_len = argc; + __getopt_nonoption_flags = + (char *) malloc (nonoption_flags_max_len); + if (__getopt_nonoption_flags == NULL) + nonoption_flags_max_len = -1; + else + memset (__mempcpy (__getopt_nonoption_flags, orig_str, len), + '\0', nonoption_flags_max_len - len); + } + } + nonoption_flags_len = nonoption_flags_max_len; + } + else + nonoption_flags_len = 0; +#endif + + return optstring; +} + +/* Scan elements of ARGV (whose length is ARGC) for option characters + given in OPTSTRING. + + If an element of ARGV starts with '-', and is not exactly "-" or "--", + then it is an option element. The characters of this element + (aside from the initial '-') are option characters. If `getopt' + is called repeatedly, it returns successively each of the option characters + from each of the option elements. + + If `getopt' finds another option character, it returns that character, + updating `optind' and `nextchar' so that the next call to `getopt' can + resume the scan with the following option character or ARGV-element. + + If there are no more option characters, `getopt' returns -1. + Then `optind' is the index in ARGV of the first ARGV-element + that is not an option. (The ARGV-elements have been permuted + so that those that are not options now come last.) 
+ + OPTSTRING is a string containing the legitimate option characters. + If an option character is seen that is not listed in OPTSTRING, + return '?' after printing an error message. If you set `opterr' to + zero, the error message is suppressed but we still return '?'. + + If a char in OPTSTRING is followed by a colon, that means it wants an arg, + so the following text in the same ARGV-element, or the text of the following + ARGV-element, is returned in `optarg'. Two colons mean an option that + wants an optional arg; if there is text in the current ARGV-element, + it is returned in `optarg', otherwise `optarg' is set to zero. + + If OPTSTRING starts with `-' or `+', it requests different methods of + handling the non-option ARGV-elements. + See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. + + Long-named options begin with `--' instead of `-'. + Their names may be abbreviated as long as the abbreviation is unique + or is an exact match for some defined option. If they have an + argument, it follows the option name in the same ARGV-element, separated + from the option name by a `=', or else in the next ARGV-element. + When `getopt' finds a long-named option, it returns 0 if that option's + `flag' field is nonzero, the value of the option's `val' field + if the `flag' field is zero. + + The elements of ARGV aren't really const, because we permute them. + But we pretend they're const in the prototype to be compatible + with other systems. + + LONGOPTS is a vector of `struct option' terminated by an + element containing a name which is zero. + + LONGIND returns the index in LONGOPTS of the long-named option found. + It is only valid when a long-named option has been found by the most + recent call. + + If LONG_ONLY is nonzero, '-' as well as '--' can introduce + long-named options.
*/ + +int +_getopt_internal (argc, argv, optstring, longopts, longind, long_only) + int argc; + char *const *argv; + const char *optstring; + const struct option *longopts; + int *longind; + int long_only; +{ + int print_errors = opterr; + if (optstring[0] == ':') + print_errors = 0; + + if (argc < 1) + return -1; + + optarg = NULL; + + if (optind == 0 || !__getopt_initialized) + { + if (optind == 0) + optind = 1; /* Don't scan ARGV[0], the program name. */ + optstring = _getopt_initialize (argc, argv, optstring); + __getopt_initialized = 1; + } + + /* Test whether ARGV[optind] points to a non-option argument. + Either it does not have option syntax, or there is an environment flag + from the shell indicating it is not an option. The later information + is only used when the used in the GNU libc. */ +#if defined _LIBC && defined USE_NONOPTION_FLAGS +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ + || (optind < nonoption_flags_len \ + && __getopt_nonoption_flags[optind] == '1')) +#else +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') +#endif + + if (nextchar == NULL || *nextchar == '\0') + { + /* Advance to the next ARGV-element. */ + + /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been + moved back by the user (who may also have changed the arguments). */ + if (last_nonopt > optind) + last_nonopt = optind; + if (first_nonopt > optind) + first_nonopt = optind; + + if (ordering == PERMUTE) + { + /* If we have just processed some options following some non-options, + exchange them so that the options come first. */ + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (last_nonopt != optind) + first_nonopt = optind; + + /* Skip any additional non-options + and extend the range of non-options previously skipped. 
*/ + + while (optind < argc && NONOPTION_P) + optind++; + last_nonopt = optind; + } + + /* The special ARGV-element `--' means premature end of options. + Skip it like a null option, + then exchange with previous non-options as if it were an option, + then skip everything else like a non-option. */ + + if (optind != argc && !strcmp (argv[optind], "--")) + { + optind++; + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (first_nonopt == last_nonopt) + first_nonopt = optind; + last_nonopt = argc; + + optind = argc; + } + + /* If we have done all the ARGV-elements, stop the scan + and back over any non-options that we skipped and permuted. */ + + if (optind == argc) + { + /* Set the next-arg-index to point at the non-options + that we previously skipped, so the caller will digest them. */ + if (first_nonopt != last_nonopt) + optind = first_nonopt; + return -1; + } + + /* If we have come to a non-option and did not permute it, + either stop the scan or describe it to the caller and pass it by. */ + + if (NONOPTION_P) + { + if (ordering == REQUIRE_ORDER) + return -1; + optarg = argv[optind++]; + return 1; + } + + /* We have found another option-ARGV-element. + Skip the initial punctuation. */ + + nextchar = (argv[optind] + 1 + + (longopts != NULL && argv[optind][1] == '-')); + } + + /* Decode the current option-ARGV-element. */ + + /* Check whether the ARGV-element is a long option. + + If long_only and the ARGV-element has the form "-f", where f is + a valid short option, don't consider it an abbreviated form of + a long option that starts with f. Otherwise there would be no + way to give the -f short option. + + On the other hand, if there's a long option "fubar" and + the ARGV-element is "-fu", do consider that an abbreviation of + the long option, just like "--fu", and not "-f" with arg "u". + + This distinction seems to be the most useful approach. 
*/ + + if (longopts != NULL + && (argv[optind][1] == '-' + || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1]))))) + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = -1; + int option_index; + + for (nameend = nextchar; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) + == (unsigned int) strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else if (long_only + || pfound->has_arg != p->has_arg + || pfound->flag != p->flag + || pfound->val != p->val) + /* Second or later nonexact match found. */ + ambig = 1; + } + + if (ambig && !exact) + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]) >= 0) + { + + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]); +#endif + } + nextchar += strlen (nextchar); + optind++; + optopt = 0; + return '?'; + } + + if (pfound != NULL) + { + option_index = indfound; + optind++; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. 
*/ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + int n; +#endif + + if (argv[optind - 1][1] == '-') + { + /* --option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("\ +%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); +#else + fprintf (stderr, _("\ +%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); +#endif + } + else + { + /* +option or -option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("\ +%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], + pfound->name); +#else + fprintf (stderr, _("\ +%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], pfound->name); +#endif + } + +#if defined _LIBC && defined USE_IN_LIBIO + if (n >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#endif + } + + nextchar += strlen (nextchar); + + optopt = pfound->val; + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); +#endif + } + nextchar += strlen (nextchar); + optopt = pfound->val; + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + + /* Can't find it as a long option. 
If this is not getopt_long_only, + or the option starts with '--' or is not a valid short + option, then it's an error. + Otherwise interpret it as a short option. */ + if (!long_only || argv[optind][1] == '-' + || my_index (optstring, *nextchar) == NULL) + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + int n; +#endif + + if (argv[optind][1] == '-') + { + /* --option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); +#else + fprintf (stderr, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); +#endif + } + else + { + /* +option or -option */ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); +#else + fprintf (stderr, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); +#endif + } + +#if defined _LIBC && defined USE_IN_LIBIO + if (n >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#endif + } + nextchar = (char *) ""; + optind++; + optopt = 0; + return '?'; + } + } + + /* Look at and handle the next short option-character. */ + + { + char c = *nextchar++; + char *temp = my_index (optstring, c); + + /* Increment `optind' when we start to process its last character. */ + if (*nextchar == '\0') + ++optind; + + if (temp == NULL || c == ':') + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + int n; +#endif + + if (posixly_correct) + { + /* 1003.2 specifies the format of this message. 
*/ +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: illegal option -- %c\n"), + argv[0], c); +#else + fprintf (stderr, _("%s: illegal option -- %c\n"), argv[0], c); +#endif + } + else + { +#if defined _LIBC && defined USE_IN_LIBIO + n = __asprintf (&buf, _("%s: invalid option -- %c\n"), + argv[0], c); +#else + fprintf (stderr, _("%s: invalid option -- %c\n"), argv[0], c); +#endif + } + +#if defined _LIBC && defined USE_IN_LIBIO + if (n >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#endif + } + optopt = c; + return '?'; + } + /* Convenience. Treat POSIX -W foo same as long option --foo */ + if (temp[0] == 'W' && temp[1] == ';') + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = 0; + int option_index; + + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. */ +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, + _("%s: option requires an argument -- %c\n"), + argv[0], c) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("%s: option requires an argument -- %c\n"), + argv[0], c); +#endif + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + return c; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + + /* optarg is now the argument, see if it's in the + table of longopts. 
*/ + + for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) == strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. */ + ambig = 1; + } + if (ambig && !exact) + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]); +#endif + } + nextchar += strlen (nextchar); + optind++; + return '?'; + } + if (pfound != NULL) + { + option_index = indfound; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. 
*/ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name); +#endif + } + + nextchar += strlen (nextchar); + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + { +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); +#endif + } + nextchar += strlen (nextchar); + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + nextchar = NULL; + return 'W'; /* Let the application handle it. */ + } + if (temp[1] == ':') + { + if (temp[2] == ':') + { + /* This is an option that accepts an argument optionally. */ + if (*nextchar != '\0') + { + optarg = nextchar; + optind++; + } + else + optarg = NULL; + nextchar = NULL; + } + else + { + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. 
*/ +#if defined _LIBC && defined USE_IN_LIBIO + char *buf; + + if (__asprintf (&buf, _("\ +%s: option requires an argument -- %c\n"), + argv[0], c) >= 0) + { + if (_IO_fwide (stderr, 0) > 0) + __fwprintf (stderr, L"%s", buf); + else + fputs (buf, stderr); + + free (buf); + } +#else + fprintf (stderr, + _("%s: option requires an argument -- %c\n"), + argv[0], c); +#endif + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + nextchar = NULL; + } + } + return c; + } +} + +int +getopt (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + return _getopt_internal (argc, argv, optstring, + (const struct option *) 0, + (int *) 0, + 0); +} + +#endif /* Not ELIDE_CODE. */ + +#ifdef TEST + +/* Compile with -DTEST to make an executable for use in testing + the above definition of `getopt'. */ + +int +main (argc, argv) + int argc; + char **argv; +{ + int c; + int digit_optind = 0; + + while (1) + { + int this_option_optind = optind ? optind : 1; + + c = getopt (argc, argv, "abc:d:0123456789"); + if (c == -1) + break; + + switch (c) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (digit_optind != 0 && digit_optind != this_option_optind) + printf ("digits occur in two different argv-elements.\n"); + digit_optind = this_option_optind; + printf ("option %c\n", c); + break; + + case 'a': + printf ("option a\n"); + break; + + case 'b': + printf ("option b\n"); + break; + + case 'c': + printf ("option c with value `%s'\n", optarg); + break; + + case '?': + break; + + default: + printf ("?? 
getopt returned character code 0%o ??\n", c); + } + } + + if (optind < argc) + { + printf ("non-option ARGV-elements: "); + while (optind < argc) + printf ("%s ", argv[optind++]); + printf ("\n"); + } + + exit (0); +} + +#endif /* TEST */ + diff -Nru gmp-ecm-7.0.4+ds/build.vs/getopt.h gmp-ecm-7.0.5+ds/build.vs/getopt.h --- gmp-ecm-7.0.4+ds/build.vs/getopt.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/getopt.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,182 @@ +/* Declarations for getopt. + Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef _GETOPT_H + +#ifndef __need_getopt +# define _GETOPT_H 1 +#endif + +/* If __GNU_LIBRARY__ is not already defined, either we are being used + standalone, or this is the first header included in the source file. + If we are being used with glibc, we need to include , but + that does not exist if we are standalone. So: if __GNU_LIBRARY__ is + not defined, include , which will pull in for us + if it's from glibc. (Why ctype.h? It's guaranteed to exist and it + doesn't flood the namespace with stuff the way some other headers do.) 
*/ +#if !defined __GNU_LIBRARY__ +# include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +extern char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +extern int optind; + +/* Callers store zero here to inhibit the error message `getopt' prints + for unrecognized options. */ + +extern int opterr; + +/* Set to an option character which was unrecognized. */ + +extern int optopt; + +#ifndef __need_getopt +/* Describe the long-named options requested by the application. + The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector + of `struct option' terminated by an element containing a name which is + zero. + + The field `has_arg' is: + no_argument (or 0) if the option does not take an argument, + required_argument (or 1) if the option requires an argument, + optional_argument (or 2) if the option takes an optional argument. + + If the field `flag' is not NULL, it points to a variable that is set + to the value given in the field `val' when the option is found, but + left unchanged if the option is not found. 
+ + To have a long-named option do something other than set an `int' to + a compiled-in constant, such as set a value from `optarg', set the + option's `flag' field to zero and its `val' field to a nonzero + value (the equivalent single-letter option character, if there is + one). For long options that have a zero `flag' field, `getopt' + returns the contents of the `val' field. */ + +struct option +{ +# if (defined __STDC__ && __STDC__) || defined __cplusplus + const char *name; +# else + char *name; +# endif + /* has_arg can't be an enum because some compilers complain about + type mismatches in all the code that assumes it is an int. */ + int has_arg; + int *flag; + int val; +}; + +/* Names for the values of the `has_arg' field of `struct option'. */ + +# define no_argument 0 +# define required_argument 1 +# define optional_argument 2 +#endif /* need getopt */ + + +/* Get definitions and prototypes for functions to process the + arguments in ARGV (ARGC of them, minus the program name) for + options given in OPTS. + + Return the option character from OPTS just read. Return -1 when + there are no more options. For unrecognized options, or options + missing arguments, `optopt' is set to the option letter, and '?' is + returned. + + The OPTS string is a list of characters which are recognized option + letters, optionally followed by colons, specifying that that letter + takes an argument, to be placed in `optarg'. + + If a letter in OPTS is followed by two colons, its argument is + optional. This behavior is specific to the GNU `getopt'. + + The argument `--' causes premature termination of argument + scanning, explicitly telling `getopt' that there are no more + options. + + If OPTS begins with `--', then non-option arguments are treated as + arguments to the option '\0'. This behavior is specific to the GNU + `getopt'. 
*/ + +#if (defined __STDC__ && __STDC__) || defined __cplusplus +# ifdef __GNU_LIBRARY__ +/* Many other libraries have conflicting prototypes for getopt, with + differences in the consts, in stdlib.h. To avoid compilation + errors, only prototype getopt for the GNU C library. */ +extern int getopt (int ___argc, char *const *___argv, const char *__shortopts); +# else /* not __GNU_LIBRARY__ */ +extern int getopt (); +# endif /* __GNU_LIBRARY__ */ + +# ifndef __need_getopt +extern int getopt_long (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind); +extern int getopt_long_only (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind); + +/* Internal only. Users should not call this directly. */ +extern int _getopt_internal (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind, + int __long_only); +# endif +#else /* not __STDC__ */ +extern int getopt (); +# ifndef __need_getopt +extern int getopt_long (); +extern int getopt_long_only (); + +extern int _getopt_internal (); +# endif +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +/* Make sure we later can get all the definitions and declarations. */ +#undef __need_getopt + +#endif /* getopt.h */ + diff -Nru gmp-ecm-7.0.4+ds/build.vs/getrusage.c gmp-ecm-7.0.5+ds/build.vs/getrusage.c --- gmp-ecm-7.0.4+ds/build.vs/getrusage.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/getrusage.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,71 @@ +/* This file is part of the MPIR Library. + + The MPIR Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, or (at + your option) any later version. 
+ The MPIR Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + You should have received a copy of the GNU Lesser General Public License + along with the MPIR Library; see the file COPYING.LIB. If not, write + to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. +*/ + +#define WIN32_LEAN_AND_MEAN + +#include +#include +#include + +#include "getrusage.h" + +typedef union file_t +{ FILETIME ft; + long long lt; +} file_t; + +int getrusage(int who, rusage *usage) +{ + HANDLE proc_hand; + file_t c_time, x_time, s_time, u_time; + int cb = 0, err = -1; + + if(who != RUSAGE_SELF) + { + errno = (who == RUSAGE_CHILDREN ? ENODATA : EINVAL); + return err; + } + + proc_hand = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, GetCurrentProcessId()); + + if(GetProcessTimes(proc_hand, &(c_time.ft), &(x_time.ft), &(s_time.ft), &(u_time.ft))) + { + PROCESS_MEMORY_COUNTERS ctrs; + + /* The units returned by GetProcessTimes are 100 nanoseconds */ + u_time.lt = (u_time.lt + 5) / 10; + s_time.lt = (s_time.lt + 5) / 10; + + usage->ru_utime.tv_sec = (long)(u_time.lt / 1000000ll); + usage->ru_stime.tv_sec = (long)(s_time.lt / 1000000ll); + usage->ru_utime.tv_usec = (long)(u_time.lt % 1000000ll); + usage->ru_stime.tv_usec = (long)(s_time.lt % 1000000ll); + + if(GetProcessMemoryInfo(proc_hand, &ctrs, sizeof(ctrs))) + { + PERFORMANCE_INFORMATION perf_info; + GetPerformanceInfo(&perf_info, sizeof(perf_info)); + usage->ru_maxrss = (DWORD) (ctrs.WorkingSetSize / perf_info.PageSize); + usage->ru_majflt = ctrs.PageFaultCount; + err = 0; + } + } + + if(err) + errno = EACCES; + CloseHandle(proc_hand); + return err; +} diff -Nru gmp-ecm-7.0.4+ds/build.vs/getrusage.h gmp-ecm-7.0.5+ds/build.vs/getrusage.h --- gmp-ecm-7.0.4+ds/build.vs/getrusage.h 
1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/getrusage.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,46 @@ + +#ifndef _GETRUSAGE_H +#define _GETRUSAGE_H + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#define ENODATA 61 +#define RUSAGE_SELF 0 +#define RUSAGE_CHILDREN -1 + +typedef struct +{ + long tv_sec; + long tv_usec; +} tval; + +typedef struct rusage +{ + tval ru_utime; /* user time used */ + tval ru_stime; /* system time used */ + long ru_maxrss; /* integral max resident set size */ + long ru_ixrss; /* integral shared text memory size */ + long ru_idrss; /* integral unshared data size */ + long ru_isrss; /* integral unshared stack size */ + long ru_minflt; /* page reclaims */ + long ru_majflt; /* page faults */ + long ru_nswap; /* swaps */ + long ru_inblock; /* block input operations */ + long ru_oublock; /* block output operations */ + long ru_msgsnd; /* messages sent */ + long ru_msgrcv; /* messages received */ + long ru_nsignals;/* signals received */ + long ru_nvcsw; /* voluntary context switches */ + long ru_nivcsw; /* involuntary context switches */ +} rusage; + +int getrusage(int who, rusage *usage); + +#if defined(__cplusplus) +} +#endif + +#endif diff -Nru gmp-ecm-7.0.4+ds/build.vs/gettimeofday.c gmp-ecm-7.0.5+ds/build.vs/gettimeofday.c --- gmp-ecm-7.0.4+ds/build.vs/gettimeofday.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/gettimeofday.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,39 @@ + +#define WIN32_LEAN_AND_MEAN +#include +#include + +#include "gettimeofday.h" + +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + FILETIME ft; + LARGE_INTEGER li; + __int64 t; + static int tzflag; + + if(tv) + { + GetSystemTimeAsFileTime(&ft); + li.LowPart = ft.dwLowDateTime; + li.HighPart = ft.dwHighDateTime; + t = li.QuadPart; + t -= EPOCHFILETIME; + t /= 10; + tv->tv_sec = (long)(t / 1000000); + tv->tv_usec = (long)(t % 1000000); + } + + if (tz) + { + if (!tzflag) + { + _tzset(); + tzflag++; + } 
+ tz->tz_minuteswest = _timezone / 60; + tz->tz_dsttime = _daylight; + } + + return 0; +} diff -Nru gmp-ecm-7.0.4+ds/build.vs/gettimeofday.h gmp-ecm-7.0.5+ds/build.vs/gettimeofday.h --- gmp-ecm-7.0.4+ds/build.vs/gettimeofday.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/gettimeofday.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,34 @@ +/* + * timeval.h 1.0 01/12/19 + * + * Defines gettimeofday, timeval, etc. for Win32 + * + * By Wu Yongwei + * + */ +#ifndef _TIMEVAL_H +#define _TIMEVAL_H + +#include + +#define EPOCHFILETIME (116444736000000000LL) + +#if defined(__cplusplus) +extern "C" +{ +#endif + +struct timezone +{ + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +int gettimeofday(struct timeval *tv, struct timezone *tz); + +#if defined(__cplusplus) +} +#endif + +#endif /* _TIMEVAL_H */ + diff -Nru gmp-ecm-7.0.4+ds/build.vs/libecm/libecm.vcxproj gmp-ecm-7.0.5+ds/build.vs/libecm/libecm.vcxproj --- gmp-ecm-7.0.4+ds/build.vs/libecm/libecm.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/libecm/libecm.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,253 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {CD555681-D65B-4173-A29C-B8BF06A4871B} + libecm + Win32Proj + 10.0 + + + + StaticLibrary + v142 + + + StaticLibrary + v142 + + + StaticLibrary + v142 + + + StaticLibrary + Static + v142 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + + ..\prebuild NO_GPU + + + Full + true + 
..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + Default + true + + + + $(IntDir)%(Filename).obj + + + + + ..\prebuild NO_GPU + + + X64 + + + Full + true + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + + + Level3 + Default + true + + + + + _WIN64 + $(IntDir)%(Filename).obj + + + + + ..\prebuild NO_GPU + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + Default + true + + + + $(IntDir)%(Filename).obj + + + + + ..\prebuild NO_GPU + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;_DEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + Default + true + + + + + _WIN64 + $(IntDir)%(Filename).obj + + + + + + + + + + + + + + + + + + + + + + + true + true + true + true + + + + + + + + + + + + + + + + + + + Full + + + Full + + + Full + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/libecm/libecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vs/libecm/libecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vs/libecm/libecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/libecm/libecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,190 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {2f18179f-5dba-420c-8dc7-bc7f8228a1b2} + + + + + Source Files\Assembler + + + Source Files\Assembler + + + + + Source Files + + + 
Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/libecm/Makefile.am gmp-ecm-7.0.5+ds/build.vs/libecm/Makefile.am --- gmp-ecm-7.0.4+ds/build.vs/libecm/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/libecm/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = libecm.vcxproj libecm.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vs/libecm_gpu/libecm_gpu.vcxproj gmp-ecm-7.0.5+ds/build.vs/libecm_gpu/libecm_gpu.vcxproj --- gmp-ecm-7.0.4+ds/build.vs/libecm_gpu/libecm_gpu.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/libecm_gpu/libecm_gpu.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,313 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {3DCDD03F-7F5E-4C3C-BC19-F5C6531EAC00} + libecm_gpu + Win32Proj + 10.0 + + + + StaticLibrary + v142 + + + StaticLibrary + v142 + + + StaticLibrary + v142 + + + StaticLibrary + Static + v142 + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + ecmlib + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + + ..\prebuild GPU + + + true + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;NDEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + + + Level3 + Default + Full + + + + compute_50,sm_50 + + + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 32 + + + true + + + + + ..\prebuild GPU + + + X64 + + + Full + true + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;NDEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + + + Level3 + Default + true + $(IntDir)vc$(PlatformToolsetVersion).pdb + + + + + _WIN64 + + + compute_50,sm_50 + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 64 + + + true + + + + + + + + + ..\prebuild GPU + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_DEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + + + Level3 + Default + true + + + + compute_50,sm_50 + + + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 32 + + + true + + + + + ..\prebuild GPU + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;WITH_GPU;GPU_CC50;_WIN64;_DEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) + true + EnableFastChecks + + + Level3 + Default + true + + + + + _WIN64 + + + compute_50,sm_50 + + + true + + + ..\;..\..\..\mpir\lib\$(IntDir) + 64 + + + true + + + + + true + true + true + + + + + + + + + + + + + + + + true + true + true + true + + + + + + 
+ + + + + + + + + + + + + Full + + + Full + + + Full + + + + true + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + Document + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/libecm_gpu/libecm_gpu.vcxproj.filters gmp-ecm-7.0.5+ds/build.vs/libecm_gpu/libecm_gpu.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vs/libecm_gpu/libecm_gpu.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/libecm_gpu/libecm_gpu.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,196 @@ + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + {dfe792df-b4ff-4147-be95-190117baae33} + + + {0315d9d5-3f8f-456a-ae54-e00de69b9350} + + + {cbe6b893-95dc-4f4b-b2e9-73245cf57c75} + + + + + Source Files + + + Source Files + + + + + Source Files\Assembler + + + Source Files\Assembler + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/libecm_gpu/Makefile.am gmp-ecm-7.0.5+ds/build.vs/libecm_gpu/Makefile.am --- 
gmp-ecm-7.0.4+ds/build.vs/libecm_gpu/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/libecm_gpu/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = libecm_gpu.vcxproj libecm_gpu.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vs/Makefile.am gmp-ecm-7.0.5+ds/build.vs/Makefile.am --- gmp-ecm-7.0.4+ds/build.vs/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,7 @@ +EXTRA_DIST = config.h ecm.sln ecm_gpu.sln file_copy.bat gen_ecm_h.bat \ + getopt.c getopt.h getrusage.c getrusage.h gettimeofday.c \ + gettimeofday.h mp_lib.props out_copy_rename.bat prebuild.bat \ + python.bat readme.txt tests.py vacopy.c vsyasm.props \ + vsyasm.targets vsyasm.xml + +DIST_SUBDIRS = assembler ecm ecm_gpu libecm libecm_gpu tune bench_mulredc diff -Nru gmp-ecm-7.0.4+ds/build.vs/mp_lib.dll.props gmp-ecm-7.0.5+ds/build.vs/mp_lib.dll.props --- gmp-ecm-7.0.4+ds/build.vs/mp_lib.dll.props 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/mp_lib.dll.props 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,25 @@ + + + + mpir\dll\ + mpir.lib + + + <_ProjectFileVersion>10.0.30128.1 + + + + __GMP_LIBGMP_DLL;%(PreprocessorDefinitions) + + + + + $(mp_dir) + true + + + $(mp_lib) + true + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/mp_lib.props gmp-ecm-7.0.5+ds/build.vs/mp_lib.props --- gmp-ecm-7.0.4+ds/build.vs/mp_lib.props 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/mp_lib.props 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,27 @@ + + + + mpir\lib\ + mpir.lib + + + <_ProjectFileVersion>10.0.30128.1 + + + + + + MultiThreaded + + + + + $(mp_dir) + true + + + $(mp_lib) + true + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/multiecm/Makefile.am gmp-ecm-7.0.5+ds/build.vs/multiecm/Makefile.am --- gmp-ecm-7.0.4+ds/build.vs/multiecm/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ 
gmp-ecm-7.0.5+ds/build.vs/multiecm/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = multiecm.vcxproj multiecm.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vs/multiecm/multiecm.vcxproj gmp-ecm-7.0.5+ds/build.vs/multiecm/multiecm.vcxproj --- gmp-ecm-7.0.4+ds/build.vs/multiecm/multiecm.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/multiecm/multiecm.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,238 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + multiecm + Win32Proj + 10.0 + {16434DC2-371C-451B-A336-820499B98B8C} + + + + Application + v142 + + + Application + v142 + + + Application + v142 + + + Application + v142 + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + + Full + true + Speed + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + false + Console + true + true + false + + + MachineX86 + + + + + X64 + + + Full + true + Speed + ..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + Default + true + + + ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) + Console + true + true + false + + + MachineX64 + 8388608 + 65536 + + + + + Disabled + 
..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX86 + + + + + X64 + + + Disabled + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + WIN32;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + ProgramDatabase + Default + true + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + Console + false + + + MachineX64 + 8388608 + 65536 + + + + + ..\;..\..\;..\assembler;..\..\..\$(mp_dir)$(IntDir);%(AdditionalIncludeDirectories) + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib + true + + + + + + + + + + + + + + + + + + + + + + + + + + + {cd555681-d65b-4173-a29c-b8bf06a4871b} + false + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/multiecm/multiecm.vcxproj.filters gmp-ecm-7.0.5+ds/build.vs/multiecm/multiecm.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vs/multiecm/multiecm.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/multiecm/multiecm.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,71 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru 
gmp-ecm-7.0.4+ds/build.vs/out_copy_rename.bat gmp-ecm-7.0.5+ds/build.vs/out_copy_rename.bat --- gmp-ecm-7.0.4+ds/build.vs/out_copy_rename.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/out_copy_rename.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,31 @@ +@echo off +if not exist %1 goto nofile +if exist %2 goto next + +echo creating directory %2 +md %2 > nul + +:next +rem strip quotes if present +set str=%2 +for /f "useback tokens=*" %%a in ('%str%') do set str=%%~a + +rem add a backslash if the output directory lacks one +set str=%str:~-1% +if "%str%" == "\" (set outf=%2%3) else (set outf=%2\%3) + +echo copying %1 to %outf% (if not present or changed) +if not exist "%outf%" goto copy + +rem don't overwrite if output exists and is not changed +fc %1 %outf% > nul && if not %errorlevel 1 goto exit +echo overwriting %outf% with %1 + +:copy +copy %1 %outf% > nul +goto exit + +:nofile +echo %1 not found + +:exit diff -Nru gmp-ecm-7.0.4+ds/build.vs/prebuild.bat gmp-ecm-7.0.5+ds/build.vs/prebuild.bat --- gmp-ecm-7.0.4+ds/build.vs/prebuild.bat 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/prebuild.bat 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ +cd ..\ +call out_copy_rename config.h ..\ config.h +call gen_ecm_h diff -Nru gmp-ecm-7.0.4+ds/build.vs/readme.txt gmp-ecm-7.0.5+ds/build.vs/readme.txt --- gmp-ecm-7.0.4+ds/build.vs/readme.txt 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/readme.txt 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,204 @@ + +Building GMP-ECM with Microsoft Visual Studio +============================================= + +If you wish to build the assembler code support you will need to +install the YASM assembler that is available at: + + http://www.tortall.net/projects/yasm/ + +THe version you need is vsyasm, which should be put it in the directory + + C:\Program Files\yasm + +Alternatively vsyasm can be installed anywhere provided the environment +variable YASMPATH is set to its 
absolute file path. + +The Multi-Precision Library - GMP and MPIR +========================================== + +GMP-ECM works with either GMP or MPIR, a fork of GMP. To build and run +GMP-ECM using Visual Studio you first need to obtain and build either +GMP or MPIR. MPIR has a fully integrated Visual Studio build system +for Windows but GMP does not. + +The VC++ build of GMP-ECM now defaults to MPIR but the property sheet +mp_lib.props can be edited to set the macro mp_lib to 'gmp' instead +of 'mpir' to build ECM using GMP. + +GMP +=== + +GMP can be built from the GMP source code available here: + + http://gmplib.org/ + +GMP can be built with mingw for 32-bit Windows and mingw64 for Windows x64. +It is reported that the resulting libraries work with Visual Studio when +appropriately renamed. + +MPIR +==== + +MPIR is available here: + + http://www.mpir.org + +It has full support for building MPIR for 32 and 64 bit Windows systems +with x86 assembler support using the YASM assembler. + +Building GMP-ECM +================ + +The build files for GMP-ECM assume that the GMP and ECM build directories +are in a common parent directory as follows: + + Parent Directory + MPIR + msvc\vs -- MPIR build files (nn = Visual Studio 2 digit version) + ... + GMP-ECM + build.vs -- ECM build files + +The root directories for GMP and GMP-ECM are assumed to have these names +irrespective of which version is being used (they used to be followed by +version numbers but this meant that the build projects had to be updated +too frequently). + +The normal (non GPU) build is opened by loading the file ecm.sln (from +the build.vs directory) into Visual Studio.
This provides these build +projects in build.vs for the non GPU build: + + ecm - the ECM application + libecm - the ECM library + tune - a program for tuning + bench_mulredc - for benchmarking mulredc + multiecm - work in progress (not working) + +The GPU build is opened by loading the file ecm_gpu.sln (from the build.vs +directory) into Visual Studio. This provides two build projects in +build.vs: + + ecm_gpu - the ECM application + libecm_gpu - the ECM library + +In all cases you have to choose either a win32 or x64 build and either a +Release or Debug configuration (however the win32 builds are no longer +actively supported and may not work). + +The non GPU Build +----------------- + +Before starting a build, there are a number of configuration options +that need to be set: + +1. If you wish to compile GMP-ECM for use on a particular processor, + select the appropriate define from the file 'ecm-params.h' in the + GMP-ECM root directory and decide which of the defines suit your + needs (e.g. __tune_corei7__). Then replace the existing define: + + /* define Windows tuning here */ + # define __tune_corei7__ + + towards the end of the config.h file in the 'build.vs' + directory (build.vs\config.h) with the chosen define. + +2. The file at 'build.vs\mul_fft-params.h' allows the FFT code to + be tuned to 32 or 64-bit systems by selecting an option by + changing the appropriate '#elif 0' to '#elif 1'. If you wish to + use the win32 AMD assembler files, you also have to use the + Visual Studio property page to define AMD_ASM (alternatively + you can edit the two files mulredc.asm and redc.asm in the + build.vs\assembler\ directory to include the AMD assembler). + +The GPU Build +------------- + +1. If you wish to build with a GPU capability you will need to + install Nvidia Nsight for Visual Studio version 5.4 and the + CUDA Toolkit v9.0. You can then build the libecm_gpu and + ecm_gpu projects. + +2.
The choices above for the non GPU build also apply when + building for a GPU based system. + + By default, the GPU configuration is "compute_50,sm_50". If + you need to change this, select libecm_gpu and ecm_gpu and + set the properties for "CUDA C/C++|Device|Code Generation" for + your GPU capability. + + Also under "C/C++|Preprocessor|Preprocessor Definitions" for + both these projects, change the current definition GPU_CC50 to + that for your GPU capability. + +Build Configurations +-------------------- + +When a version of ecm and libecm are built, the library and the application +are put in the directory matching the configuration that has been built: + + GMP-ECM + build.vs -- ECM build files + lib -- ECM static library files + bin -- ECM executable files + +within these lib, dll and bin directories, the outputs are located in +sub-directories determined by the platform and configuration: + + win32\release + win32\debug + x64\release + x64\debug + +If you don't want assembler support you need to change the define: + +#define NATIVE_REDC 1 + +in config.h (in the build.vs subdirectory) to: + +#undef NATIVE_REDC + +Tune +==== + +If tune is compiled and run for a particular configuration it will output +suitable values for optimising GMP-ECM to the console window. To optimise +GMP-ECM these values should be put in a suitably named file which then has +to be integrated in ecm-params.h. + +Tests +===== + +The file tests.py is a python script that runs the ECM tests. It runs the +x64/release-amd (non GPU) version by default but can be edited to test other +builds. It cannot run some tests as a result of the difficulty in the +conversion of the Unix shell scripts for the tests for use on Windows.
+ +Running the tests with bash +=========================== + +It is possible to run the tests with the Windows version of bash which is +available here: + + http://win-bash.sourceforge.net/ + +The bash executable needs to be obtained and placed in the gmp-ecm root +directory alongside the existing test files which have the names test.ext +or testlong.ext where 'ext' is the name of the test (for example test.pp1). + +With a Windows command prompt opened in the gmp-ecm root directory, the +commands to run one of the tests on the x64|Release version of gmp-ecm is: + + bash test.ext bin\x64\Release\ecm.exe + + and: + + bash testlong.ext bin\x64\Release\ecm.exe + +where 'ext' is the filename extension of the test required. + +To test the GPU version the command is: + + bash test.gpuecm bin\x64\Release\ecm_gpu.exe + + Brian Gladman, 1st October 2019 diff -Nru gmp-ecm-7.0.4+ds/build.vs/tests.py gmp-ecm-7.0.5+ds/build.vs/tests.py --- gmp-ecm-7.0.4+ds/build.vs/tests.py 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/tests.py 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,150 @@ + +from __future__ import print_function +import os +import sys +import string +import platform +from re import match +from subprocess import Popen, PIPE, STDOUT +from tempfile import * + +try: + from time import perf_counter as timer +except ImportError: + from time import clock as timer + + +x64 = True +debug = False +test_gpu_version = True +run_non_gpu_tests = True +run_gpu_tests = True + +class Timer() : + def __enter__(self): self.start = timer() + def __exit__(self, *args): print(' time {:.3f} milliseconds'.format(1000 * (timer() - self.start))) + +cpath = os.path.dirname(__file__) +config = 'x64' if x64 else 'Win32' +mode = 'Debug' if debug else 'Release' +test_dir = '..\\bin\\{:s}\\{:s}\\'.format(config, mode) + +def get_tests(filename): + print('running tests in {:s}'.format(filename)) + start, sub, tests, c_tests = True, dict(), [], [] + with open(os.path.join(cpath, 
filename)) as f: + lines = f.readlines() + cnt, lnth = 0, len(lines) + while cnt < lnth: + try: + line = lines[cnt].strip() + cnt += 1 + tkns = line.split() + if line.startswith('echo') and len(tkns) > 2 and tkns[2] == '|': + while cnt < lnth and 'checkcode' not in line: + while cnt < lnth and not lines[cnt]: + cnt += 1 + if cnt < lnth: + line += '|' + lines[cnt] + cnt += 1 + start = False + elif start: + sp = line.split('="') + if len(sp) == 2: + if sp[1].startswith('${1:-./ecm}'): + sub[sp[0]] = sp[1][12:-1] + else: + sub[sp[0]] = sp[1][:-1] + continue + else: + continue + line = line.replace(';', '|') + sub_tests = [] + for line_part in line.split('|'): + tkns = line_part.strip().split() + cmd = [] + for tok in tkns: + if tok.startswith('"') and tok.endswith('"'): + tok = tok[1:-1] + if tok[0] == '$' and tok[1:] in sub: + tok = tok.replace(tok, sub[tok[1:]]) + elif tok == './ecm': + tok = '' + cmd += [tok] + cseq = [] + if cmd and cmd[0] == 'echo': + cseq += [cmd[1]] + cmd = cmd[2:] + if len(cmd) >= 3 and cmd[-3] == 'checkcode' and cmd[-2] == '$?': + cseq += [int(cmd[-1])] + cmd = cmd[:-3] + cmd = (' '.join(cmd)).strip() + if cmd: + cseq += [cmd] + sub_tests += [cseq] + if len(sub_tests) == 3 and all(len(x) == 1 for x in sub_tests): + tests += [tuple(x[0] for x in sub_tests)] + else: + c_tests += [sub_tests] + except ValueError: + print('parsing error on line {} in text "{}"'.format(cnt, line)) + return tests, c_tests + +def run_exe(exe, args, inp) : + al = {'stdin' : PIPE, 'stdout' : PIPE, 'stderr' : STDOUT } + if sys.platform.startswith('win'): + al['creationflags'] = 0x08000000 + p = Popen([os.path.join(cpath, exe)] + args.split(' '), **al) + res = p.communicate(inp.encode())[0].decode() + ret = p.poll() + return (ret, res) + +def output_complex_tests(x): + print('these tests are too complex:') + for t in x: + print(t) + +def do_tests(tests, ctests, out=False, gpu=False): + ecm_exe = test_dir + ("ecm_gpu.exe" if gpu else "ecm.exe") + err_cnt = 0 + for ix, tt 
in enumerate(tests): + print(tt[1], tt[0], end='') + rv = run_exe(ecm_exe, tt[1], tt[0]) + if type(tt[2]) == int and rv[0] != tt[2]: + print(" - *** ERROR in test {:d}: {:d} {:d} ***".format(ix, rv[0], tt[2])) + err_cnt += 1 + elif type(tt[2]) == tuple and rv[0] != tt[2][0] and rv[0] != tt[2][1]: + print(" - *** ERROR in test {:d}: {:d} {:s} ***".format(ix, rv[0], tt[2])) + err_cnt += 1 + else: + print(" - passed") + if out: + op = rv[1].rsplit('\r\n') + for i in op : + print(i) + + if ctests: + output_complex_tests(ctests) + if not err_cnt: + if ctests: + print('all other tests passed') + else: + print('all tests passed') + +with Timer(): + if os.path.exists('test.pm1.save'): + os.remove('test.pm1.save') + if run_non_gpu_tests: + t, ct = get_tests("..\\test.ecm") + do_tests(t, ct) + t, ct = get_tests("..\\test.pm1") + do_tests(t, ct) + t, ct = get_tests("..\\test.pp1") + do_tests(t, ct) + t, ct = get_tests("..\\testlong.pp1") + do_tests(t, ct) + t, ct = get_tests("..\\testlong.pm1") + do_tests(t, ct) + if run_gpu_tests: + t, ct = get_tests("..\\test.gpuecm") + do_tests(t, ct, gpu=True) diff -Nru gmp-ecm-7.0.4+ds/build.vs/tune/Makefile.am gmp-ecm-7.0.5+ds/build.vs/tune/Makefile.am --- gmp-ecm-7.0.4+ds/build.vs/tune/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/tune/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +EXTRA_DIST = tune.vcxproj tune.vcxproj.filters diff -Nru gmp-ecm-7.0.4+ds/build.vs/tune/tune.vcxproj gmp-ecm-7.0.5+ds/build.vs/tune/tune.vcxproj --- gmp-ecm-7.0.4+ds/build.vs/tune/tune.vcxproj 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/tune/tune.vcxproj 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,164 @@ + + + + + Release + Win32 + + + Release + x64 + + + + {80E08750-5C6C-492E-BB1E-7200978AE125} + tune + Win32Proj + 10.0 + + + + Application + Unicode + true + v142 + + + Application + NotSet + v142 + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + 
$(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + + + + MaxSpeed + true + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_CONSOLE;TUNE;%(PreprocessorDefinitions) + MultiThreaded + true + + + Level3 + ProgramDatabase + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);%(AdditionalDependencies) + true + Console + true + true + MachineX86 + + + + + + + + + $(IntDir)%(FileName).obj + + + + + X64 + + + MaxSpeed + true + ..\..\..\$(mp_dir)$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) + WIN32;_WIN64;NDEBUG;_CONSOLE;TUNE;%(PreprocessorDefinitions) + MultiThreaded + true + + + Level3 + ProgramDatabase + + + ..\..\..\$(mp_dir)$(IntDir)$(mp_lib);%(AdditionalDependencies) + true + Console + true + true + MachineX64 + + + + + + + _WIN64 + $(IntDir)%(FileName).obj + + + + + + + + + + + + + + + TUNE_MULREDC_THRESH#0;TUNE_SQRREDC_THRESH#0;%(PreprocessorDefinitions) + TUNE_MULREDC_THRESH#0;TUNE_SQRREDC_THRESH#0;%(PreprocessorDefinitions) + + + + + + + + + + + + + + + + + + + + + + + + + + + {cd555681-d65b-4173-a29c-b8bf06a4871b} + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/tune/tune.vcxproj.filters gmp-ecm-7.0.5+ds/build.vs/tune/tune.vcxproj.filters --- gmp-ecm-7.0.4+ds/build.vs/tune/tune.vcxproj.filters 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/tune/tune.vcxproj.filters 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,109 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {38f1a18f-40fc-4eed-a68e-e79b58327b6c} + + + + + Source Files\Assembler + + + Source Files\Assembler + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + 
Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/vacopy.c gmp-ecm-7.0.5+ds/build.vs/vacopy.c --- gmp-ecm-7.0.4+ds/build.vs/vacopy.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/vacopy.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,8 @@ + +#include +#include + +void _vacopy(va_list *pap, va_list ap) +{ + *pap = ap; +} diff -Nru gmp-ecm-7.0.4+ds/build.vs/vsyasm.props gmp-ecm-7.0.5+ds/build.vs/vsyasm.props --- gmp-ecm-7.0.4+ds/build.vs/vsyasm.props 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/vsyasm.props 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,27 @@ + + + + Midl + CustomBuild + + + _SelectedFiles;$(YASMDependsOn) + + + C:\Program Files\yasm\ + + + + False + $(IntDir)%(FileName).obj + 0 + 0 + "$(YASM_PATH)"vsyasm.exe -Xvc -f $(Platform) [AllOptions] [AdditionalOptions] [Inputs] + %(ObjectFile) + Assembling %(Filename)%(Extension) ==> $(IntDir)%(FileName).obj + false + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/vsyasm.targets gmp-ecm-7.0.5+ds/build.vs/vsyasm.targets --- gmp-ecm-7.0.4+ds/build.vs/vsyasm.targets 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/vsyasm.targets 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,110 @@ + + + + + + + _YASM + + + + + + $(ComputeLinkInputsTargets); + ComputeYASMOutput; + + + $(ComputeLibInputsTargets); + ComputeYASMOutput; + + + + + $(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml + + + + + + <_YASMReadTlog + Include="^%(YASM.FullPath);%(YASM.AdditionalDependencies)" + Condition="'%(YASM.ExcludedFromBuild)' != 'true' and '%(YASM.ObjectFile)' != 
''"/> + <_YASMWriteTlog + Include="^%(YASM.FullPath);$([MSBuild]::NormalizePath('$(MSBuildProjectDirectory)', '%(YASM.ObjectFile)'))" + Condition="'%(YASM.ExcludedFromBuild)' != 'true' and '%(YASM.ObjectFile)' != ''"/> + + + + + + + + <_YASMReadTlog Remove="@(_YASMReadTlog)" /> + <_YASMWriteTlog Remove="@(_YASMWriteTlog)" /> + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/build.vs/vsyasm.xml gmp-ecm-7.0.5+ds/build.vs/vsyasm.xml --- gmp-ecm-7.0.4+ds/build.vs/vsyasm.xml 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/build.vs/vsyasm.xml 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,283 @@ + + + + + + + + + + + + + General + + + + + + Symbols + + + + + + Files + + + + + + Command Line + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Execute Before + + + Specifies the targets for the build customization to run before. + + + + + + + + + + + + Execute After + + + Specifies the targets for the build customization to run after. + + + + + + + + + + + + + + + + + + Additional Options + + + Additional Options + + + + + + + + \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/c270 gmp-ecm-7.0.5+ds/c270 --- gmp-ecm-7.0.4+ds/c270 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/c270 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +119289271643410661659011895607772139647921376305643327040133340560424162255590014476006522815032183459410826649812381782024580435488953253421188979709766709602606088795137550915161002646980490840248715831209139007534778111499928885914673593417099541437886281968493172159 diff -Nru gmp-ecm-7.0.4+ds/cgbn_stage1.cu gmp-ecm-7.0.5+ds/cgbn_stage1.cu --- gmp-ecm-7.0.4+ds/cgbn_stage1.cu 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/cgbn_stage1.cu 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,888 @@ +/* cgbn_stage1.h: header for CGBN (GPU) based ecm stage 1. 
+ +Copyright 2021 Seth Troisi + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License +along with this program; see the file COPYING. If not, see +http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +#ifndef _CGBN_STAGE1_CU +#define _CGBN_STAGE1_CU 1 + +#ifndef __CUDACC__ +#error "This file should only be compiled with nvcc" +#endif + +#include "cgbn_stage1.h" + +#include +#include +#include +#include + +// GMP import must proceed cgbn.h +#include +#include +#include + +#include "cudacommon.h" + +#include "ecm.h" +#include "ecm-gpu.h" + + +// See cgbn_error_t enum (cgbn.h:39) +#define cgbn_normalized_error ((cgbn_error_t) 14) +#define cgbn_positive_overflow ((cgbn_error_t) 15) +#define cgbn_negative_overflow ((cgbn_error_t) 16) + +// Seems to adds very small overhead (1-10%) +#define VERIFY_NORMALIZED 0 +// Adds even less overhead (<1%) +#define CHECK_ERROR 1 + +// Tested with check_gpuecm.sage +#define CARRY_BITS 6 + +// Can dramatically change compile time +#if 1 + #define FORCE_INLINE __forceinline__ +#else + #define FORCE_INLINE +#endif + +// support routine copied from "CGBN/samples/utility/support.h" +void cgbn_check(cgbn_error_report_t *report, const char *file=NULL, int32_t line=0) { + // check for cgbn errors + + if(cgbn_error_report_check(report)) { + fprintf (stderr, "\n"); + fprintf (stderr, "CGBN error occurred: %s\n", cgbn_error_string(report)); + + 
if(report->_instance!=0xFFFFFFFF) { + fprintf (stderr, "Error reported by instance %d", report->_instance); + if(report->_blockIdx.x!=0xFFFFFFFF) + fprintf (stderr, ", blockIdx=(%d, %d, %d)", report->_blockIdx.x, report->_blockIdx.y, report->_blockIdx.z); + if(report->_threadIdx.x!=0xFFFFFFFF) + fprintf (stderr, ", threadIdx=(%d, %d, %d)", report->_threadIdx.x, report->_threadIdx.y, report->_threadIdx.z); + fprintf (stderr, "\n"); + } + else { + fprintf (stderr, "Error reported by blockIdx=(%d %d %d)", report->_blockIdx.x, report->_blockIdx.y, report->_blockIdx.z); + fprintf (stderr, "threadIdx=(%d %d %d)\n", report->_threadIdx.x, report->_threadIdx.y, report->_threadIdx.z); + } + if(file!=NULL) + fprintf (stderr, "file %s, line %d\n", file, line); + exit(1); + } +} + +#define CGBN_CHECK(report) cgbn_check(report, __FILE__, __LINE__) + +static +void to_mpz(mpz_t r, const uint32_t *x, uint32_t count) { + mpz_import (r, count, -1, sizeof(uint32_t), 0, 0, x); +} + +static +void from_mpz(const mpz_t s, uint32_t *x, uint32_t count) { + size_t words; + + if(mpz_sizeinbase (s, 2) > count * 32) { + fprintf (stderr, "from_mpz failed -- result does not fit\n"); + exit(EXIT_FAILURE); + } + + mpz_export (x, &words, -1, sizeof(uint32_t), 0, 0, s); + while(words= 512 may not be supported for > 2048 bit kernels */ +const uint32_t TPB_DEFAULT = 256; + +template +class cgbn_params_t { + public: + // parameters used by the CGBN context + static const uint32_t TPB=TPB_DEFAULT; // Reasonable default + static const uint32_t MAX_ROTATION=4; // good default value + static const uint32_t SHM_LIMIT=0; // no shared mem available + static const bool CONSTANT_TIME=false; // not implemented + + // parameters used locally in the application + static const uint32_t TPI=tpi; // threads per instance + static const uint32_t BITS=bits; // instance size +}; + + +template +class curve_t { + public: + + typedef cgbn_context_t context_t; + typedef cgbn_env_t env_t; + typedef typename env_t::cgbn_t bn_t; 
+ typedef cgbn_mem_t mem_t; + + context_t _context; + env_t _env; + int32_t _instance; // which curve instance is this + + // Constructor + __device__ FORCE_INLINE curve_t(cgbn_monitor_t monitor, cgbn_error_report_t *report, int32_t instance) : + _context(monitor, report, (uint32_t)instance), _env(_context), _instance(instance) {} + + // Verify 0 <= r < modulus + __device__ FORCE_INLINE void assert_normalized(bn_t &r, const bn_t &modulus) { + //if (VERIFY_NORMALIZED && _context.check_errors()) + if (VERIFY_NORMALIZED && CHECK_ERROR) { + + // Negative overflow + if (cgbn_extract_bits_ui32(_env, r, params::BITS-1, 1)) { + _context.report_error(cgbn_negative_overflow); + } + // Positive overflow + if (cgbn_compare(_env, r, modulus) >= 0) { + _context.report_error(cgbn_positive_overflow); + } + } + } + + // Normalize after addition + __device__ FORCE_INLINE void normalize_addition(bn_t &r, const bn_t &modulus) { + if (cgbn_compare(_env, r, modulus) >= 0) { + cgbn_sub(_env, r, r, modulus); + } + } + + // Normalize after subtraction (handled instead by checking carry) + /* + __device__ FORCE_INLINE void normalize_subtraction(bn_t &r, const bn_t &modulus) { + if (cgbn_extract_bits_ui32(_env, r, params::BITS-1, 1)) { + cgbn_add(_env, r, r, modulus); + } + } + */ + + /** + * Calculate (r * m) / 2^32 mod modulus + * + * This removes a factor of 2^32 which is not present in m. 
+ * Otherwise m (really d) needs to be passed as a bigint not a uint32 + */ + __device__ FORCE_INLINE void special_mult_ui32(bn_t &r, uint32_t m, const bn_t &modulus, uint32_t np0) { + //uint32_t thread_i = (blockIdx.x*blockDim.x + threadIdx.x)%params::TPI; + bn_t temp; + + uint32_t carry_t1 = cgbn_mul_ui32(_env, r, r, m); + uint32_t t1_0 = cgbn_extract_bits_ui32(_env, r, 0, 32); + uint32_t q = t1_0 * np0; + uint32_t carry_t2 = cgbn_mul_ui32(_env, temp, modulus, q); + + cgbn_shift_right(_env, r, r, 32); + cgbn_shift_right(_env, temp, temp, 32); + // Add back overflow carry + cgbn_insert_bits_ui32(_env, r, r, params::BITS-32, 32, carry_t1); + cgbn_insert_bits_ui32(_env, temp, temp, params::BITS-32, 32, carry_t2); + + // This needs to be measured at block containing top bit of modulus + int32_t carry_q = cgbn_add(_env, r, r, temp); + carry_q += cgbn_add_ui32(_env, r, r, t1_0 != 0); // add 1 + while (carry_q != 0) { + carry_q -= cgbn_sub(_env, r, r, modulus); + } + + // 0 <= r, temp < modulus => r + temp + 1 < 2*modulus + if (cgbn_compare(_env, r, modulus) >= 0) { + cgbn_sub(_env, r, r, modulus); + } + } + + + __device__ FORCE_INLINE void double_add_v2( + bn_t &q, bn_t &u, + bn_t &w, bn_t &v, + uint32_t d, + const bn_t &modulus, + const uint32_t np0) { + // q = xA = aX + // u = zA = aY + // w = xB = bX + // v = zB = bY + + /* Doesn't seem to be a large cost to using many extra variables */ + bn_t t, CB, DA, AA, BB, K, dK; + + /* Can maybe use one more bit if cgbn_add subtracts when carry happens */ + + cgbn_add(_env, t, v, w); // t = (bY + bX) + normalize_addition(t, modulus); + if (cgbn_sub(_env, v, v, w)) // v = (bY - bX) + cgbn_add(_env, v, v, modulus); + + + cgbn_add(_env, w, u, q); // w = (aY + aX) + normalize_addition(w, modulus); + if (cgbn_sub(_env, u, u, q)) // u = (aY - aX) + cgbn_add(_env, u, u, modulus); + if (VERIFY_NORMALIZED) { + assert_normalized(t, modulus); + assert_normalized(v, modulus); + assert_normalized(w, modulus); + assert_normalized(u, 
modulus); + } + + cgbn_mont_mul(_env, CB, t, u, modulus, np0); // C*B + normalize_addition(CB, modulus); + cgbn_mont_mul(_env, DA, v, w, modulus, np0); // D*A + normalize_addition(DA, modulus); + + /* Roughly 40% of time is spent in these two calls */ + cgbn_mont_sqr(_env, AA, w, modulus, np0); // AA + cgbn_mont_sqr(_env, BB, u, modulus, np0); // BB + normalize_addition(AA, modulus); + normalize_addition(BB, modulus); + if (VERIFY_NORMALIZED) { + assert_normalized(CB, modulus); + assert_normalized(DA, modulus); + assert_normalized(AA, modulus); + assert_normalized(BB, modulus); + } + + // q = aX is finalized + cgbn_mont_mul(_env, q, AA, BB, modulus, np0); // AA*BB + normalize_addition(q, modulus); + assert_normalized(q, modulus); + + if (cgbn_sub(_env, K, AA, BB)) // K = AA-BB + cgbn_add(_env, K, K, modulus); + + // By definition of d = (sigma / 2^32) % MODN + // K = k*R + // dK = d*k*R = (K * R * sigma) >> 32 + cgbn_set(_env, dK, K); + special_mult_ui32(dK, d, modulus, np0); // dK = K*d + assert_normalized(dK, modulus); + + cgbn_add(_env, u, BB, dK); // BB + dK + normalize_addition(u, modulus); + if (VERIFY_NORMALIZED) { + assert_normalized(K, modulus); + assert_normalized(dK, modulus); + assert_normalized(u, modulus); + } + + // u = aY is finalized + cgbn_mont_mul(_env, u, K, u, modulus, np0); // K(BB+dK) + normalize_addition(u, modulus); + assert_normalized(u, modulus); + + cgbn_add(_env, w, DA, CB); // DA + CB + normalize_addition(w, modulus); + if (cgbn_sub(_env, v, DA, CB)) // DA - CB + cgbn_add(_env, v, v, modulus); + if (VERIFY_NORMALIZED) { + assert_normalized(w, modulus); + assert_normalized(v, modulus); + } + + // w = bX is finalized + cgbn_mont_sqr(_env, w, w, modulus, np0); // (DA+CB)^2 mod N + normalize_addition(w, modulus); + assert_normalized(w, modulus); + + cgbn_mont_sqr(_env, v, v, modulus, np0); // (DA-CB)^2 mod N + normalize_addition(v, modulus); + assert_normalized(v, modulus); + + // v = bY is finalized + cgbn_shift_left(_env, v, v, 1); // 
double + normalize_addition(v, modulus); + assert_normalized(v, modulus); + } +}; + +/* Allocate and fill the host-side input buffer for the stage 1 kernel. + * For each of the `curves` curves, five numbers of BITS/32 limbs are written + * in this order: N, P1_x = 2, P1_y = 1, P2_x = 9 and + * P2_y = (64 * d + 8) mod N, where d = (sigma + index) * (2^32)^-1 mod N. + * The buffer size in bytes is stored in *data_size and the malloc'ed buffer + * is returned. + * NOTE(review): the malloc result is not checked here. + */ +static +uint32_t* set_p_2p(const mpz_t N, + uint32_t curves, uint32_t sigma, + uint32_t BITS, size_t *data_size) { + // P1_x, P1_y = (2,1) + // 2P_x, 2P_y = (9, 64 * d + 8) + + /** Keeps a copy of N (AKA modulo) per curve */ + const size_t limbs_per = BITS/32; + *data_size = 5 * curves * limbs_per * sizeof(uint32_t); + uint32_t *data = (uint32_t*) malloc(*data_size); + uint32_t *datum = data; + + mpz_t x; + mpz_init(x); + for(int index = 0; index < curves; index++) { + // d = (sigma / 2^32) mod N BUT 2^32 handled by special_mult_ui32 + uint32_t d = sigma + index; + + // mod + from_mpz(N, datum + 0 * limbs_per, BITS/32); + + // P1 (X, Y) + mpz_set_ui(x, 2); + from_mpz(x, datum + 1 * limbs_per, BITS/32); + mpz_set_ui(x, 1); + from_mpz(x, datum + 2 * limbs_per, BITS/32); + + // 2P = P2 (X, Y) + // P2_y = 64 * d + 8 + mpz_set_ui(x, 9); + from_mpz(x, datum + 3 * limbs_per, BITS/32); + + // d = sigma * mod_inverse(2 ** 32, N) + mpz_ui_pow_ui(x, 2, 32); + mpz_invert(x, x, N); + mpz_mul_ui(x, x, d); + // P2_y = (64 * d + 8) mod N + mpz_mul_ui(x, x, 64); + mpz_add_ui(x, x, 8); + mpz_mod(x, x, N); + + outputf (OUTPUT_TRACE, "sigma %d => P2_y: %Zd\n", d, x); + from_mpz(x, datum + 4 * limbs_per, BITS/32); + datum += 5 * limbs_per; + } + mpz_clear(x); + return data; +} + + +// kernel implementation using cgbn +template +__global__ void kernel_double_add( + cgbn_error_report_t *report, + uint64_t s_bits, + uint64_t s_bits_start, + uint64_t s_bits_interval, + uint32_t* gpu_s_bits, + uint32_t *data, + uint32_t count, + uint32_t sigma_0, + uint32_t np0 + ) { + // decode an instance_i number from the blockIdx and threadIdx + int32_t instance_i = (blockIdx.x*blockDim.x + threadIdx.x)/params::TPI; + if(instance_i >= count) + return; + + /* Cast uint32_t array to mem_t */ + typename curve_t::mem_t *data_cast = (typename curve_t::mem_t*) data; + + cgbn_monitor_t monitor = CHECK_ERROR ?
cgbn_report_monitor : cgbn_no_checks; + + curve_t curve(monitor, report, instance_i); + typename curve_t::bn_t aX, aY, bX, bY, modulus; + + { // Setup + cgbn_load(curve._env, modulus, &data_cast[5*instance_i+0]); + cgbn_load(curve._env, aX, &data_cast[5*instance_i+1]); + cgbn_load(curve._env, aY, &data_cast[5*instance_i+2]); + cgbn_load(curve._env, bX, &data_cast[5*instance_i+3]); + cgbn_load(curve._env, bY, &data_cast[5*instance_i+4]); + + /* Convert points to mont, has a miniscule bit of overhead with batching. */ + uint32_t np0_test = cgbn_bn2mont(curve._env, aX, aX, modulus); + assert(np0 == np0_test); + + cgbn_bn2mont(curve._env, aY, aY, modulus); + cgbn_bn2mont(curve._env, bX, bX, modulus); + cgbn_bn2mont(curve._env, bY, bY, modulus); + + { + curve.assert_normalized(aX, modulus); + curve.assert_normalized(aY, modulus); + curve.assert_normalized(bX, modulus); + curve.assert_normalized(bY, modulus); + } + } + + uint32_t d = sigma_0 + instance_i; + int swapped = 0; + for (uint32_t b = s_bits_start; b < s_bits_start + s_bits_interval; b++) { + /* Process bits from MSB to LSB, last index to first index + * b counts from 0 to s_num_bits */ + int nth = s_bits - 1 - b; + int bit = (gpu_s_bits[nth/32] >> (nth&31)) & 1; + if (bit != swapped) { + swapped = !swapped; + cgbn_swap(curve._env, aX, bX); + cgbn_swap(curve._env, aY, bY); + } + curve.double_add_v2(aX, aY, bX, bY, d, modulus, np0); + } + + if (swapped) { + cgbn_swap(curve._env, aX, bX); + cgbn_swap(curve._env, aY, bY); + } + + { // Final output + // Convert everything back to bn + cgbn_mont2bn(curve._env, aX, aX, modulus, np0); + cgbn_mont2bn(curve._env, aY, aY, modulus, np0); + cgbn_mont2bn(curve._env, bX, bX, modulus, np0); + cgbn_mont2bn(curve._env, bY, bY, modulus, np0); + + { + curve.assert_normalized(aX, modulus); + curve.assert_normalized(aY, modulus); + curve.assert_normalized(bX, modulus); + curve.assert_normalized(bY, modulus); + } + cgbn_store(curve._env, &data_cast[5*instance_i+1], aX); + 
cgbn_store(curve._env, &data_cast[5*instance_i+2], aY); + cgbn_store(curve._env, &data_cast[5*instance_i+3], bX); + cgbn_store(curve._env, &data_cast[5*instance_i+4], bY); + } +} + +static +int findfactor(mpz_t factor, const mpz_t N, const mpz_t x_final, const mpz_t y_final) { + // XXX: combine / refactor logic with cudawrapper.c findfactor + + mpz_t temp; + mpz_init(temp); + + /* Check if factor found */ + bool inverted = mpz_invert(temp, y_final, N); // aY ^ (N-2) % N + + if (inverted) { + mpz_mul(temp, x_final, temp); // aX * aY^-1 + mpz_mod(factor, temp, N); // "Residual" + mpz_clear(temp); + return ECM_NO_FACTOR_FOUND; + } + mpz_clear(temp); + + mpz_gcd(factor, y_final, N); + return ECM_FACTOR_FOUND_STEP1; +} + + +static +int verify_size_of_n(const mpz_t N, size_t max_bits) { + size_t n_log2 = mpz_sizeinbase(N, 2); + + /* Using check_gpuecm.sage it looks like 4 bits would suffice. */ + size_t max_usable_bits = max_bits - CARRY_BITS; + + if (n_log2 <= max_usable_bits) + return ECM_NO_FACTOR_FOUND; + + outputf (OUTPUT_ERROR, "GPU: N(%d bits) + carry(%d bits) > BITS(%d)\n", + n_log2, CARRY_BITS, max_bits); + outputf (OUTPUT_ERROR, "GPU: Error, input number should be stricly lower than 2^%d\n", + max_usable_bits); + return ECM_ERROR; +} + + +static +uint32_t find_np0(const mpz_t N) { + uint32_t np0; + mpz_t temp; + mpz_init(temp); + mpz_ui_pow_ui(temp, 2, 32); + assert(mpz_invert(temp, N, temp)); + np0 = -mpz_get_ui(temp); + mpz_clear(temp); + return np0; +} + + +static +uint32_t* allocate_and_set_s_bits(const mpz_t s, uint64_t *nbits) { + uint64_t num_bits = *nbits = mpz_sizeinbase (s, 2); + + uint64_t allocated = (num_bits + 31) / 32; + uint32_t *s_bits = (uint32_t*) malloc (sizeof(uint32_t) * allocated); + + uint64_t countp; + mpz_export (s_bits, &countp, -1, sizeof(uint32_t), 0, 0, s); + assert (countp == allocated); + + return s_bits; +} + +static +int process_results(mpz_t *factors, int *array_found, + const mpz_t N, + const uint32_t *data, uint32_t 
cgbn_bits, + int curves, uint32_t sigma) { + mpz_t x_final, y_final, modulo; + mpz_init(modulo); + mpz_init(x_final); + mpz_init(y_final); + + const uint32_t limbs_per = cgbn_bits / 32; + + int youpi = ECM_NO_FACTOR_FOUND; + int errors = 0; + for(size_t i = 0; i < curves; i++) { + const uint32_t *datum = data + (5 * i * limbs_per);; + + if (test_verbose (OUTPUT_TRACE) && i == 0) { + to_mpz(modulo, datum + 0 * limbs_per, limbs_per); + outputf (OUTPUT_TRACE, "index: 0 modulo: %Zd\n", modulo); + + to_mpz(x_final, datum + 1 * limbs_per, limbs_per); + to_mpz(y_final, datum + 2 * limbs_per, limbs_per); + outputf (OUTPUT_TRACE, "index: 0 pA: (%Zd, %Zd)\n", x_final, y_final); + + to_mpz(x_final, datum + 3 * limbs_per, limbs_per); + to_mpz(y_final, datum + 4 * limbs_per, limbs_per); + outputf (OUTPUT_TRACE, "index: 0 pB: (%Zd, %Zd)\n", x_final, y_final); + } + + // Make sure we were testing the right number. + to_mpz(modulo, datum + 0 * limbs_per, limbs_per); + assert(mpz_cmp(modulo, N) == 0); + + to_mpz(x_final, datum + 1 * limbs_per, limbs_per); + to_mpz(y_final, datum + 2 * limbs_per, limbs_per); + + /* Very suspicious for (x_final, y_final) to match (x_0, y_0) == (2, 1) + * Can happen when + * 1. block calculation performed incorrectly (and some blocks not run) + * 2. Kernel didn't run because not enough register + * 3. 
nvcc links old version of kernel when something changed + */ + if (mpz_cmp_ui (x_final, 2) == 0 && mpz_cmp_ui (y_final, 1) == 0) { + errors += 1; + if (errors < 10 || errors % 100 == 1) + outputf (OUTPUT_ERROR, "GPU: curve %d didn't compute?\n", i); + } + + array_found[i] = findfactor(factors[i], N, x_final, y_final); + if (array_found[i] != ECM_NO_FACTOR_FOUND) { + youpi = array_found[i]; + outputf (OUTPUT_NORMAL, "GPU: factor %Zd found in Step 1 with curve %ld (-sigma %d:%d)\n", + factors[i], i, ECM_PARAM_BATCH_32BITS_D, sigma + i); + } + } + + /* Release all three temporaries; modulo must be cleared, not + * re-initialized, otherwise its limbs are leaked. */ + mpz_clear(modulo); + mpz_clear(x_final); + mpz_clear(y_final); + +#ifdef IS_DEV_BUILD + if (errors) + outputf (OUTPUT_ERROR, "Had %d errors. Try `make clean; make` or reducing TPB_DEFAULT\n", + errors); +#endif + + if (errors > 2) + return ECM_ERROR; + + return youpi; +} + +int cgbn_ecm_stage1(mpz_t *factors, int *array_found, + const mpz_t N, const mpz_t s, + uint32_t curves, uint32_t sigma, + float *gputime, int verbose) +{ + assert( sigma > 0 ); + assert( ((uint64_t) sigma + curves) <= 0xFFFFFFFF ); // no overflow + + uint64_t s_num_bits; + uint32_t *s_bits = allocate_and_set_s_bits(s, &s_num_bits); + assert( s_bits != NULL ); + if (s_num_bits >= 100000000) + outputf (OUTPUT_NORMAL, "GPU: Large B1, S = %'d bits = %d MB\n", + s_num_bits, s_num_bits >> 20); + + cudaEvent_t global_start, batch_start, stop; + CUDA_CHECK(cudaEventCreate (&global_start)); + CUDA_CHECK(cudaEventCreate (&batch_start)); + CUDA_CHECK(cudaEventCreate (&stop)); + CUDA_CHECK(cudaEventRecord (global_start)); + + // Copy s_bits + uint32_t *gpu_s_bits; + uint32_t s_words = (s_num_bits + 31) / 32; + CUDA_CHECK(cudaMalloc((void **)&gpu_s_bits, sizeof(uint32_t) * s_words)); + CUDA_CHECK(cudaMemcpy(gpu_s_bits, s_bits, sizeof(uint32_t) * s_words, cudaMemcpyHostToDevice)); + + cgbn_error_report_t *report; + // create a cgbn_error_report for CGBN to report back errors + CUDA_CHECK(cgbn_error_report_alloc(&report)); + + size_t data_size; + uint32_t *data,
*gpu_data; + + uint32_t BITS = 0; // kernel bits + int32_t TPB=TPB_DEFAULT; // Always the same default + int32_t TPI; + int32_t IPB; // IPB = TPB / TPI, instances per block + size_t BLOCK_COUNT; // How many blocks to cover all curves + + /** + * Smaller TPI is faster, Larger TPI is needed for large inputs. + * N > 512 TPI=8 | N > 2048 TPI=16 | N > 8192 TPI=32 + * + * Larger takes longer to compile (and increases binary size) + * No GPU, No CGBN | ecm 3.4M, 2 seconds to compile + * GPU, No CGBN | ecm 3.5M, 3 seconds + * (8, 1024) | ecm 3.8M, 12 seconds + * (16,8192) | ecm 4.2M, 1 minute + * (32,16384) | ecm 4.2M, 1 minute + * (32,32768) | ecm 5.2M, 4.7 minutes + */ + /* NOTE: Custom kernel changes here + * For "Compling custom kernel for %d bits should be XX% faster" + * Change the 512 in cgbn_params_t<4, 512> cgbn_params_512; + * to the suggested value (a multiple of 32 >= bits + 6). + * You may need to change the 4 to an 8 (or 16) if bits >512, >2048 + */ + /** TODO: try with const vector for BITs/TPI, see if compiler is happy */ + std::vector available_kernels; + + typedef cgbn_params_t<4, 512> cgbn_params_512; + typedef cgbn_params_t<8, 1024> cgbn_params_1024; + available_kernels.push_back((uint32_t)cgbn_params_512::BITS); + available_kernels.push_back((uint32_t)cgbn_params_1024::BITS); + +#ifndef IS_DEV_BUILD + /** + * TPI and BITS have to be set at compile time. Adding multiple cgbn_params + * (and their associated kernels) allows for better dynamic selection based + * on the size of N (e.g. N < 1024, N < 2048, N < 4096) but increase compile + * time and binary size. A few reasonable sizes are included and a verbose + * warning is printed when a particular N might benefit from a custom sized + * kernel. 
+ */ + typedef cgbn_params_t<8, 1536> cgbn_params_1536; + typedef cgbn_params_t<8, 2048> cgbn_params_2048; + typedef cgbn_params_t<16, 3072> cgbn_params_3072; + typedef cgbn_params_t<16, 4096> cgbn_params_4096; + available_kernels.push_back((uint32_t)cgbn_params_1536::BITS); + available_kernels.push_back((uint32_t)cgbn_params_2048::BITS); + available_kernels.push_back((uint32_t)cgbn_params_3072::BITS); + available_kernels.push_back((uint32_t)cgbn_params_4096::BITS); +#endif + + size_t n_log2 = mpz_sizeinbase(N, 2); + for (int k_i = 0; k_i < available_kernels.size(); k_i++) { + uint32_t kernel_bits = available_kernels[k_i]; + if (kernel_bits >= n_log2 + CARRY_BITS) { + BITS = kernel_bits; + assert( BITS % 32 == 0 ); + + /* Print some debug info about kernel. */ + /* TODO: return kernelAttr and validate maxThreadsPerBlock. */ + if (BITS == cgbn_params_512::BITS) { + TPI = cgbn_params_512::TPI; + kernel_info((const void*)kernel_double_add, verbose); + } else if (BITS == cgbn_params_1024::BITS) { + TPI = cgbn_params_1024::TPI; + kernel_info((const void*)kernel_double_add, verbose); +#ifndef IS_DEV_BUILD + } else if (BITS == cgbn_params_1536::BITS) { + TPI = cgbn_params_1536::TPI; + kernel_info((const void*)kernel_double_add, verbose); + } else if (BITS == cgbn_params_2048::BITS) { + TPI = cgbn_params_2048::TPI; + kernel_info((const void*)kernel_double_add, verbose); + } else if (BITS == cgbn_params_3072::BITS) { + TPI = cgbn_params_3072::TPI; + kernel_info((const void*)kernel_double_add, verbose); + } else if (BITS == cgbn_params_4096::BITS) { + TPI = cgbn_params_4096::TPI; + kernel_info((const void*)kernel_double_add, verbose); +#endif + } else { + /* lowercase k to help differentiate this error from one below */ + outputf (OUTPUT_ERROR, "CGBN kernel not found for %d bits\n", BITS); + return ECM_ERROR; + } + + IPB = TPB / TPI; + BLOCK_COUNT = (curves + IPB - 1) / IPB; + + break; + } + } + + if (BITS == 0) { + outputf (OUTPUT_ERROR, "No available CGBN Kernel large 
enough to process N(%d bits)\n", n_log2); + return ECM_ERROR; + } + + /* Alert that recompiling with a smaller kernel would likely improve speed */ + { + size_t optimized_bits = ((n_log2 + 5)/128 + 1) * 128; + if (optimized_bits < BITS && 0.8 * BITS > n_log2 ) { + /* Assume speed is roughly O(N) but slightly slower for not being a power of two */ + float pct_faster = 90 * BITS / optimized_bits; + assert(pct_faster > 100); + outputf (OUTPUT_VERBOSE, "Compiling custom kernel for %d bits should be ~%.0f%% faster\n", + optimized_bits, pct_faster); + } + } + + int youpi = verify_size_of_n(N, BITS); + if (youpi != ECM_NO_FACTOR_FOUND) { + return youpi; + } + + /* Consistency check that struct cgbn_mem_t is byte aligned without extra fields. */ + assert( sizeof(curve_t::mem_t) == cgbn_params_512::BITS/8 ); + assert( sizeof(curve_t::mem_t) == cgbn_params_1024::BITS/8 ); + data = set_p_2p(N, curves, sigma, BITS, &data_size); + + /* np0 is -(N^-1 mod 2**32), used for montgomery representation */ + uint32_t np0 = find_np0(N); + + // Copy data + outputf (OUTPUT_VERBOSE, "Copying %d bits of data to GPU\n", data_size); + CUDA_CHECK(cudaMalloc((void **)&gpu_data, data_size)); + CUDA_CHECK(cudaMemcpy(gpu_data, data, data_size, cudaMemcpyHostToDevice)); + + outputf (OUTPUT_VERBOSE, + "CGBN<%d, %d> running kernel<%d block x %d threads> input number is %d bits\n", + BITS, TPI, BLOCK_COUNT, TPB, n_log2); + + /* First bit (doubling) is handled in set_p_2p */ + uint64_t s_partial = 1; + + /* Start with small batches and increase till timing is ~100ms */ + uint64_t batch_size = 100; + + int batches_complete = 0; + /* gputime and batch_time are measured in ms */ + float batch_time = 0; + + while (s_partial < s_num_bits) { + /* decrease batch_size for final batch if needed */ + batch_size = std::min(s_num_bits - s_partial, batch_size); + + /* print ETA with lessing frequently, 5 early + 5 per 10s + 5 per 100s + every 1000s */ + if ((batches_complete < 3) || + (batches_complete < 30 && 
batches_complete % 10 == 0) || + (batches_complete < 500 && batches_complete % 100 == 0) || + (batches_complete < 5000 && batches_complete % 1000 == 0) || + (batches_complete % 10000 == 0)) { + outputf (OUTPUT_VERBOSE, "Computing %d bits/call, %d/%d (%.1f%%)", + batch_size, s_partial, s_num_bits, 100.0 * s_partial / s_num_bits); + if (batches_complete < 2 || *gputime < 1000) { + outputf (OUTPUT_VERBOSE, "\n"); + } else { + float estimated_total = (*gputime) * ((float) s_num_bits) / s_partial; + float eta = estimated_total - (*gputime); + outputf (OUTPUT_VERBOSE, ", ETA %.f + %.f = %.f seconds (~%.f ms/curves)\n", + eta / 1000, *gputime / 1000, estimated_total / 1000, + estimated_total / curves); + } + } + + CUDA_CHECK(cudaEventRecord (batch_start)); + + if (BITS == cgbn_params_512::BITS) { + kernel_double_add<<>>( + report, s_num_bits, s_partial, batch_size, gpu_s_bits, gpu_data, curves, sigma, np0); + } else if (BITS == cgbn_params_1024::BITS) { + kernel_double_add<<>>( + report, s_num_bits, s_partial, batch_size, gpu_s_bits, gpu_data, curves, sigma, np0); +#ifndef IS_DEV_BUILD + } else if (BITS == cgbn_params_1536::BITS) { + kernel_double_add<<>>( + report, s_num_bits, s_partial, batch_size, gpu_s_bits, gpu_data, curves, sigma, np0); + } else if (BITS == cgbn_params_2048::BITS) { + kernel_double_add<<>>( + report, s_num_bits, s_partial, batch_size, gpu_s_bits, gpu_data, curves, sigma, np0); + } else if (BITS == cgbn_params_3072::BITS) { + kernel_double_add<<>>( + report, s_num_bits, s_partial, batch_size, gpu_s_bits, gpu_data, curves, sigma, np0); + } else if (BITS == cgbn_params_4096::BITS) { + kernel_double_add<<>>( + report, s_num_bits, s_partial, batch_size, gpu_s_bits, gpu_data, curves, sigma, np0); +#endif + } else { + outputf (OUTPUT_ERROR, "CGBN Kernel not found for %d bits\n", BITS); + return ECM_ERROR; + } + + s_partial += batch_size; + batches_complete++; + + /* error report uses managed memory, sync the device and check for cgbn errors */ + 
CUDA_CHECK(cudaDeviceSynchronize()); + if (report->_error) + outputf (OUTPUT_ERROR, "\n\nerror: %d\n", report->_error); + CGBN_CHECK(report); + + CUDA_CHECK(cudaEventRecord (stop)); + CUDA_CHECK(cudaEventSynchronize (stop)); + cudaEventElapsedTime (&batch_time, batch_start, stop); + cudaEventElapsedTime (gputime, global_start, stop); + /* Adjust batch_size to aim for 100ms */ + if (batch_time < 80) { + batch_size = 11*batch_size/10; + } else if (batch_time > 120) { + batch_size = max(100ul, 9*batch_size / 10); + } + } + + // Copy data back from GPU memory + outputf (OUTPUT_VERBOSE, "Copying results back to CPU ...\n"); + CUDA_CHECK(cudaMemcpy(data, gpu_data, data_size, cudaMemcpyDeviceToHost)); + + cudaEventElapsedTime (gputime, global_start, stop); + + youpi = process_results(factors, array_found, N, data, BITS, curves, sigma); + + // clean up + CUDA_CHECK(cudaFree(gpu_s_bits)); + CUDA_CHECK(cudaFree(gpu_data)); + CUDA_CHECK(cgbn_error_report_free(report)); + CUDA_CHECK(cudaEventDestroy (global_start)); + CUDA_CHECK(cudaEventDestroy (batch_start)); + CUDA_CHECK(cudaEventDestroy (stop)); + + free(s_bits); + free(data); + + return youpi; +} + +#ifdef __CUDA_ARCH__ + #if __CUDA_ARCH__ < 300 + #error "Unsupported architecture" + #endif +#endif + +#endif /* _CGBN_STAGE1_CU */ diff -Nru gmp-ecm-7.0.4+ds/cgbn_stage1.h gmp-ecm-7.0.5+ds/cgbn_stage1.h --- gmp-ecm-7.0.4+ds/cgbn_stage1.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/cgbn_stage1.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,42 @@ +/* cgbn_stage1.h: header for CGBN (GPU) based ecm stage 1. + + Copyright 2021 Seth Troisi + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; see the file COPYING. If not, write to the Free + Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + 02111-1307, USA. +*/ + +#ifndef _CGBN_STAGE1_H +#define _CGBN_STAGE1_H 1 + +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int cgbn_ecm_stage1(mpz_t *factors, int *array_found, + const mpz_t N, const mpz_t s, + uint32_t curves, uint32_t sigma, + float *gputime, int verbose); + +#ifdef __cplusplus +} +#endif + + +#endif /* _CGBN_STAGE1_H */ diff -Nru gmp-ecm-7.0.4+ds/champions.h gmp-ecm-7.0.5+ds/champions.h --- gmp-ecm-7.0.4+ds/champions.h 2016-10-11 09:26:27.000000000 +0000 +++ gmp-ecm-7.0.5+ds/champions.h 2022-06-06 14:16:49.000000000 +0000 @@ -7,8 +7,8 @@ "Paul Zimmermann ", "Paul Zimmermann "}; static char *champion_url[3] = -{"http://wwwmaths.anu.edu.au/~brent/ftp/champs.txt", +{"http://maths-people.anu.edu.au/~brent/ftp/champs.txt", "http://www.loria.fr/~zimmerma/records/Pminus1.html", "http://www.loria.fr/~zimmerma/records/Pplus1.html"}; /* minimal number of digits to enter the champions table for ECM, P-1, P+1 */ -static unsigned int champion_digits[3] = { 73, 54, 48 }; +static unsigned int champion_digits[3] = { 74, 54, 48 }; diff -Nru gmp-ecm-7.0.4+ds/ChangeLog gmp-ecm-7.0.5+ds/ChangeLog --- gmp-ecm-7.0.4+ds/ChangeLog 2016-10-11 09:25:46.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ChangeLog 2022-06-06 14:16:49.000000000 +0000 @@ -1,3 +1,1769 @@ +commit 8b63ba3270020908ff2c605f00ac95a541340367 +Author: Paul Zimmermann +Date: Thu May 5 11:35:58 2022 +0200 + + fixed compiler warnings + +commit 1afb36ebf693014eb62e7c21590c0f29afd57e0f +Author: Seth Troisi +Date: Thu Mar 17 02:01:48 2022 -0700 + + 
Expand list of CUDA cc codes to include 80 and 90 series + +commit f2aec916ffcde2dd0e388ce1897ced7d094259e7 +Author: Seth Troisi +Date: Thu Mar 17 01:30:58 2022 -0700 + + Faster -save with large gpu curve count + +commit c56cfceb4ed6ef7711bb59655b37735b8041db7d +Author: Seth Troisi +Date: Thu Sep 9 13:52:53 2021 -0700 + + Add Seth Troisi to authors + +commit 9d5b903396bb24ae73fad7106de71153bad4d6a3 +Author: Seth Troisi +Date: Wed Nov 10 17:59:27 2021 -0800 + + Add comment explaining extra GPU time after curves are done + +commit d013c5018f2db5a3fd5e89551d4575749b587324 +Merge: d8f7afc 0ad8a8c +Author: Paul Zimmermann +Date: Tue Mar 15 10:54:21 2022 +0100 + + Merge branch 'gpu_tests' of gitlab.inria.fr:zimmerma/ecm into gpu_tests + +commit 0ad8a8c8a0ff17c4b54a9df63d37119a780bd09e +Author: Seth Troisi +Date: Mon Aug 30 19:35:33 2021 -0700 + + check_gpuecm.sage add smallestGroupOrder + + This allows for + $ sage --preparse check_gpuecm.sage + $ mv -f check_gpuecm.sage.py check_gpuecm.py + $ python -c "import check_gpuecm; check_gpuecm.smallestGroupOrder(594538100848945223169882301931953, 3, 10, 1000)" + + Which can help find B1/B2 limits for tests where prime will be found. 
+ +commit 923c9415152afc74f6a59fe32a4eb95fbb19c9a9 +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Mon Jan 17 15:21:50 2022 +0000 + + Update configure.ac - Fix WINDOWS64_ABI for mingw64 targets + +commit 09f4c42e8a2237657194ff06b82f780d7b278a51 +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Mon Jan 17 03:08:05 2022 +0000 + + Add mulredc1.asm for Linux and Win64 + +commit b44037d4b74f960da3bd3c0b1babdbec69011f58 +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Mon Jan 17 01:56:40 2022 +0000 + + Update mulredc1.m4 - fix mulredc1_xx for win64 + +commit bff07a7ff416b324656936753679327ec207485e +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Mon Jan 17 01:55:40 2022 +0000 + + Update mulredc.m4 - fix k=3 to k=20 for win64 + +commit 3f083d03ed8894a6339454500e97b7f4a2fc49cc +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Mon Jan 17 01:54:43 2022 +0000 + + Update autogen.py - fix k=1 and k=2 for win64 + +commit f83edb3cf665af92663baac0610644b67156da4a +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Mon Jan 17 01:52:17 2022 +0000 + + Delete mulredc1.asm - has code that breaks win64 builds + +commit cfa3304dd2abd0604bcc4da2f16e63d76063e0a1 +Author: Brian Gladman +Date: Wed Jan 26 22:44:18 2022 +0000 + + Update Visual Studio build to use CUDA v11.5 + Conditional MSVC addition in cudacommon.h for definition of int32_t + +commit 42e02b964accf4d7b3bbea099e91ef8e4678d0b5 +Merge: 4013d15 0e97deb +Author: Brian Gladman +Date: Wed Jan 26 19:50:12 2022 +0000 + + Merge branch 'master' of gitlab.inria.fr:zimmerma/ecm + +commit 0e97deb611f15e2547cb46f6cd6216a31e434433 +Author: Paul Zimmermann +Date: Wed Dec 1 14:56:43 2021 +0100 + + added section about GMP-ECM packages + +commit 3f8604b2974ba59fdf0a3a534782ba87abfc79ef +Author: Paul Zimmermann +Date: Wed Dec 1 14:50:51 2021 +0100 + + renamed back + +commit 
a67270777c6ad582fe9c8c3b9580189ce8d705f3 +Author: Paul Zimmermann +Date: Wed Dec 1 14:49:29 2021 +0100 + + renamed README to README.md + +commit 4b8d0bf2462c5741d0ba407d7deb0be7dce87c6c +Author: Seth Troisi +Date: Sun Oct 24 00:14:01 2021 -0700 + + cgbn_stage1: Remove B1 limit, Reduce GPU memory 8x + +commit f7bcd6750aad10c2515e6b7f0d5093b29a530c68 +Author: Seth Troisi +Date: Sat Oct 16 05:25:52 2021 -0700 + + Remove getNumberOfBlockPerMultiProcessor + + 2.0 and 2.1 are no longer + getNumberOfBlockPerMultiProcessor returns a static 2 + +commit 258c076ec164181ffba72890800654e744db1b11 +Author: Seth Troisi +Date: Fri Oct 15 02:51:17 2021 -0700 + + Add note about endianness + +commit 6dddca4a2301e4be753bebc1d10f06cd85e4a4fb +Author: Seth Troisi +Date: Mon Sep 13 12:37:42 2021 -0700 + + cgbn_stage1: Hack fix for custom kernel TPI + +commit 980d80e79c9be830bc79fab7e1a2b9afb8cad0e4 +Author: Seth Troisi +Date: Fri Sep 10 13:39:09 2021 -0700 + + cgbn_stage1: Tuning + +commit 5680a7ae3a9624045a7f456dacbc01206f2a192f +Author: Seth Troisi +Date: Thu Sep 2 00:09:37 2021 -0700 + + cgbn_stage1: Reduce duplicated inlined double_add_v2 code + +commit 10029404df1db7d64bc523ec02b88932a8c3001f +Author: Seth Troisi +Date: Wed Sep 1 14:57:44 2021 -0700 + + Remove c++11 const vector init + +commit 38dba282bf020ee07692c4b8041b47adce4c6cda +Author: Seth Troisi +Date: Tue Aug 31 00:45:39 2021 -0700 + + cuda: Refactor some common code to cudacommon.h + + Extract get_device_prob from select_and_init_GPU + +commit c1aa300fbe4401e7bc3ed582456d191d5a07670d +Author: Seth Troisi +Date: Fri Oct 1 22:22:41 2021 -0700 + + cgbn_stage1: Extract find_np0 + +commit 77948f945a4614e9eb2a82e088e8664fffd9342b +Author: Seth Troisi +Date: Fri Oct 1 21:44:08 2021 -0700 + + cgbn_stage1: remove ecm_param scruct + +commit ed13a84671c8b3e3eba01af14e24d461048e270a +Author: Seth Troisi +Date: Tue Aug 31 21:46:44 2021 -0700 + + cgbn_stage1: Improve kernel names add comment about custom kernels + +commit 
b18d27a7c76a2f19ccecc55d5048fe885f6ea547 +Author: Seth Troisi +Date: Mon Aug 30 19:42:18 2021 -0700 + + cgbn_stage1: Make bits_per_batch dynamic, add ETA + +commit 75672c65b54082f37be35767dde674ad2f32a4b0 +Author: Seth Troisi +Date: Wed Aug 25 11:40:06 2021 -0700 + + cgbn_stage1: break kernel calls into intervals + + Helps with system responsiveness + +commit 37cba2bc710d6f03184bfba99e1b0febc1ab39dc +Author: Seth Troisi +Date: Tue Aug 24 16:29:28 2021 -0700 + + cgbn_stage1: tune for <512 bits + +commit 6e4c1aece2c62215459d63a7dfa806cc8e0dc794 +Author: wraythex <59678213+wraythex@users.noreply.github.com> +Date: Sat Aug 28 16:23:03 2021 -0500 + + Move cudaDeviceSchedule* to device init (#3) + + * Add schedule option + * Move schedule call to device initialization + * Update cuda_check to print the error number + * Remove un-needed error output + +commit c2f89580ab20c51256178e2a20633830f27e91df +Author: Seth Troisi +Date: Tue Aug 24 15:33:43 2021 -0700 + + cgbn_stage1: dynamic bit level part 2 + +commit e8ef8f9e4e4ee93e5e31a008a722824ba72707b8 +Author: Seth Troisi +Date: Tue Aug 24 13:23:56 2021 -0700 + + cgbn_stage1: dynamic bit level part 1 + + Refactor out some param & bit logic + +commit ebf1d479c1d104fe674976843b5c166398fa0205 +Author: Seth Troisi +Date: Fri Oct 1 20:26:55 2021 -0700 + + Improve GPU input overflow detection + +commit f41e1fee4b8b69f5d6163f06aa19ec6872e9343b +Author: Seth Troisi +Date: Wed Sep 1 14:46:03 2021 -0700 + + Added automake process files to .gitignore + +commit b6edc67096dccda55abe52afd990e5f7336f9706 +Author: Seth Troisi +Date: Mon Aug 23 22:40:13 2021 -0700 + + Cleanup GPU flags in acinclude.m4 + +commit df7b3762e34492234d09640509df1f53b1a16c0c +Author: Seth Troisi +Date: Fri Aug 20 14:32:10 2021 -0700 + + Minor cleanups to makefile, acinclude, spacing in cgbn code + +commit 9d2484195442acb31e58ee780c469285be860a3b +Author: Seth Troisi +Date: Sat Oct 2 00:23:03 2021 -0700 + + Update ecm champion url + +commit 
26aacfa7e6d1e86c8596cf96b3f07ee7915816ff +Author: Paul Zimmermann +Date: Thu Oct 14 17:09:08 2021 +0200 + + patch from Jerome BENOIT + +commit 237f3567bb75be9023e4fd636a32819abfc45238 +Author: Paul Zimmermann +Date: Thu Oct 14 17:05:22 2021 +0200 + + update after migration to gitlab + +commit 3631a253bf614b53473c34f1305929b42545f5ff +Author: Seth Troisi +Date: Fri Oct 1 20:25:50 2021 -0700 + + Refactor check_gpuecm.sage + + Add overflowTest and timingTest + +commit 26211282d679c1980565ff8f7275305540e47185 +Author: Seth Troisi +Date: Wed Sep 1 17:08:34 2021 -0700 + + gpu_throughput_test.sh + +commit 95ac93301e7679cd47aee3cada76a24db0b9d664 +Author: Seth Troisi +Date: Wed Sep 29 22:33:01 2021 -0700 + + test.gpuecm: Verify GPU residuals with CPU + +commit de92909baaa6f6881291cf6a1c6a43d3c4499b65 +Author: Seth Troisi +Date: Wed Sep 1 02:11:01 2021 -0700 + + Added test.cgbnecm to run test.gpuecm with -cgbn + +commit e9c197e37dc5ad0d6e20a17cbe22eae3dac89b11 +Author: Seth Troisi +Date: Fri Sep 10 16:43:41 2021 -0700 + + Removed two very old TODO files, added TODO.gpu + +commit 8f826e82035baa5361ec3a902aadad1bed16ab71 +Author: Seth Troisi +Date: Wed Aug 18 19:02:20 2021 -0700 + + Update README.gpu + + Add details on CGBN + Add common fix for cuda present error + +commit fa61067e038f494986698e65cd0cd11ae70845f6 +Author: Seth Troisi +Date: Sat Oct 2 00:01:01 2021 -0700 + + Add missing commented example to testlong.ecm + +commit 2d2e2e637afe4510f4e7f68b153fdb2880e10c78 +Author: Seth Troisi +Date: Thu Aug 19 18:38:50 2021 -0700 + + Added example CGBN GPU code + + This will be followed by a new stage 1 implementation. + The new implementation is both 2-3x faster and supports a large range of + inputs (512-32K bits). 
+ +commit 186d4f2906cdc54099054fb262ec5cecff07e758 +Author: Seth Troisi +Date: Fri Aug 20 02:44:19 2021 -0700 + + with_cgbn_include for new CGBN gpu code + +commit 3c3d468c7cc6d140564c6188a9d961648bbca75c +Merge: 261fdbb f03297b +Author: Seth Troisi +Date: Thu Sep 30 18:43:11 2021 +0000 + + Merge branch 'user_redefine_cc' into 'master' + + disable user_redefine_cc when CC is $GCC + + Closes #21859 + + See merge request zimmerma/ecm!23 + +commit 261fdbbd8fcab38ffae9a87e019bf2e548dc1906 +Author: Seth Troisi +Date: Thu Sep 30 03:13:02 2021 -0700 + + fix two warnings from !17 (see #21859) + +commit f03297b27db6a8aff4b4d5062b955a1d8e5120a5 +Author: Seth Troisi +Date: Thu Sep 30 02:18:35 2021 -0700 + + disable user_redefine_cc when CC is $GCC + +commit 3d37c3906b1141faa746d83f301bd5381b2c16cd +Merge: 563495a 245af7b +Author: Seth Troisi +Date: Thu Sep 23 09:25:08 2021 +0000 + + Merge branch 'gpu_configure' into 'master' + + Improve --enable-gpu + + See merge request zimmerma/ecm!19 + +commit 563495ae3dfc8b09dff08478ea01691d71c178ff +Merge: 0b6d645 2f50b8a +Author: ZIMMERMANN Paul +Date: Thu Sep 16 09:35:57 2021 +0000 + + Merge branch 'improve_tests_v2' into 'master' + + Improve tests + + See merge request zimmerma/ecm!21 + +commit 2f50b8a4182538e211df0bb98f1ca38b67b216ec +Author: Seth Troisi +Date: Wed Sep 15 12:35:06 2021 -0700 + + Improve tests + + (2nd attempt and 65be45e8 was reverted in 15ef0cbc) + + Cleanup checkcode $C + Verify some error strings + Create test_dummy2.save in tests (and remove from repo) + +commit 0b6d6456c7f22da593a6e6e288341ec6e5de0e7d +Merge: 0d4db6e 1fed9c9 +Author: Seth Troisi +Date: Wed Sep 15 20:03:54 2021 +0000 + + Merge branch 'rho_fix' into 'master' + + Update standalone rho to use primesieve & improve help + + See merge request zimmerma/ecm!18 + +commit 1fed9c9eef5fc2979715c91516e5e9800b0b1944 +Author: Seth Troisi +Date: Wed Sep 15 12:51:28 2021 -0700 + + Add more usage details to rho when called without arg + +commit 
07f6805b03af0b0b3a533ee8c9c0b6ee06a0e63e +Author: Seth Troisi +Date: Thu Sep 9 19:05:10 2021 -0700 + + Update standalone rho to use primesieve + + Only one small update to match functionality + + primesieve_skipto(11); primesieve_next() == 13 + primegen_skipto(11); primegen_next() == 11 + +commit 0d4db6ec345b87dba58a7285f02a2cc0944331bc +Author: Seth Troisi +Date: Wed Sep 15 12:21:51 2021 -0700 + + cleanup two warnings from !16 (see #21859) + +commit 0101f5be206e2074c5bc168cfa9237e2f86f08c4 +Merge: 3dba32a 3dca1e4 +Author: ZIMMERMANN Paul +Date: Wed Sep 15 12:57:11 2021 +0000 + + Merge branch 'step1_multiple_factors_final' into 'master' + + Add reducefactors to find primes from many composites in stage1 GPU code + + See merge request zimmerma/ecm!17 + +commit 3dba32a4e8e375f5b160379d644d7f52297d86cd +Merge: fa5525c 24a544f +Author: ZIMMERMANN Paul +Date: Wed Sep 15 12:50:17 2021 +0000 + + Merge branch 'param_cleanup' into 'master' + + Simplify get_curve_from_param[13] with mpz_invert + + See merge request zimmerma/ecm!16 + +commit fa5525c297175d86ee9fda3b7774893df60f9b87 +Merge: f2c8661 15ef0cb +Author: ZIMMERMANN Paul +Date: Wed Sep 15 12:47:22 2021 +0000 + + Merge branch 'revert-f398b6b5' into 'master' + + Revert "Merge branch 'improve_tests' into 'master'" + + See merge request zimmerma/ecm!20 + +commit 15ef0cbc56cc9675cc9259b9147ad9cf86db5fe7 +Author: ZIMMERMANN Paul +Date: Wed Sep 15 12:44:59 2021 +0000 + + Revert "Merge branch 'improve_tests' into 'master'" + + This reverts merge request !15 + +commit f2c86613381eb0b83854d2acb11c14f56289b1b8 +Author: Paul Zimmermann +Date: Wed Sep 15 11:12:25 2021 +0200 + + added known issue + +commit f398b6b57af06b48761bb7d3caabf7e0761c911f +Merge: 9f837eb 65be45e +Author: ZIMMERMANN Paul +Date: Wed Sep 15 08:35:31 2021 +0000 + + Merge branch 'improve_tests' into 'master' + + Improve tests + + See merge request zimmerma/ecm!15 + +commit 9f837eb04aac7be2868ce5f72cdd43afe0977b48 +Merge: dfd8046 5b6825c +Author: ZIMMERMANN Paul 
+Date: Wed Sep 15 08:30:30 2021 +0000 + + Merge branch 'doc_power_pm1' into 'master' + + exit if -power or -dickson is used with P-1/P+1 + + See merge request zimmerma/ecm!10 + +commit 245af7b180ce803e2c90379210c3462b332d39ad +Author: Seth Troisi +Date: Fri Aug 20 02:27:55 2021 -0700 + + Added NVCC_CHECK_COMPILE to simplify gpu checks + +commit b97a2c1021a8250d2b1767f895692610f7138712 +Author: Seth Troisi +Date: Thu Aug 19 05:54:52 2021 -0700 + + Add back support for --enable-gpu=62 + +commit 3dca1e467f9c790f2cab2878054890d3b4c24a6d +Author: Seth Troisi +Date: Wed Aug 25 16:45:06 2021 -0700 + + array_stage_found -> array_found + + The code notices what stage the factor was found in and never mixes + stage 1 and stage 2 factors so array_found is a better name. + +commit 1529a4d648bd2978ada4da20322bebf1caf22ace +Author: Seth Troisi +Date: Wed Aug 25 16:27:41 2021 -0700 + + Add reducefactors to find primes from many composites in stage1 GPU code + +commit 65be45e89810c6367eaff018bf0ed45bb064e6c8 +Author: Seth Troisi +Date: Wed Sep 1 13:39:10 2021 -0700 + + Improve tests + + Cleanup checkcode $C + Verify some error strings + Create test_dummy2.save in tests (and remove from repo) + +commit 24a544fb4ece8da20ef94a990c1211409b2ca29b +Author: Seth Troisi +Date: Sat Aug 7 01:25:15 2021 -0700 + + Simplify get_curve_from_param[13] with mpz_invert + +commit dfd8046b32041a102fe15ad07a5a54e8aaff3f92 +Merge: 2c057e2 bb00c05 +Author: ZIMMERMANN Paul +Date: Tue Sep 14 14:41:17 2021 +0000 + + Merge branch 'execstack_otherintel' into 'master' + + Execstack otherintel + + See merge request zimmerma/ecm!7 + +commit 2c057e23c8f162a21f9015e2a83ec00b417278eb +Merge: 968cb8f e32a867 +Author: ZIMMERMANN Paul +Date: Tue Sep 14 14:39:27 2021 +0000 + + Merge branch 'git_ignore' into 'master' + + Moderately complete .gitignore + + See merge request zimmerma/ecm!12 + +commit 968cb8f72f8c7e2d0cfbee07f9334b22ce7918cd +Merge: 7157d2e ea3d295 +Author: ZIMMERMANN Paul +Date: Tue Sep 14 14:38:14 2021 
+0000 + + Merge branch 'faster_expected_curves' into 'master' + + Faster expected curves + + See merge request zimmerma/ecm!13 + +commit 7157d2e7161f25d2d9b2d1509ed83ceb1729fc1a +Merge: 793dd86 b018e41 +Author: ZIMMERMANN Paul +Date: Tue Sep 14 14:35:45 2021 +0000 + + Merge branch 'sm_70_fix' into 'master' + + Fix cuda error in SM_70 for __any + + See merge request zimmerma/ecm!8 + +commit 793dd860e2f5d65bc949dee5974204e669bd3e5b +Merge: b3f5661 6c2e1db +Author: ZIMMERMANN Paul +Date: Tue Sep 14 14:32:13 2021 +0000 + + Merge branch 'gpu_verbose_tweak' into 'master' + + Better verbosity control for -gpu + + See merge request zimmerma/ecm!14 + +commit 6c2e1db93851690b6239bbc38b668ef7820e38f6 +Author: Seth Troisi +Date: Fri Sep 10 00:33:38 2021 -0700 + + Better verbosity control for -gpu + + `ecm -gpu -v` now prints + + Expected number of curves + Block size + Stage 1 time + Stage 1 throughput + Curve Step 2 took ms + Stage 2 time + Stage 2 throughput + Expected time to find a factor + + this removes the ~20 lines per stage2 that would otherwise print at -v + +commit ea3d2958a825413720213605e348ebe3a008cca9 +Author: Seth Troisi +Date: Thu Sep 9 19:45:03 2021 -0700 + + Move compute_s to after print_expcurves + +commit e32a867bb0788204c1fbf2af543a1e0a80eab16b +Author: Seth Troisi +Date: Thu Aug 19 07:50:54 2021 -0700 + + Moderately complete .gitignore + +commit 5b6825c537a55e79adc3eeaca3e7cae2c68b194a +Author: Seth Troisi +Date: Thu Sep 9 17:27:14 2021 -0700 + + exit if -power or -dickson is used with P-1/P+1 + + This has been broken since 2013 when the old PM1 stage 1 + was removed in git commit 36108424 + +commit d8f7afca3c0c1d4c21763dc086bb7dfed8cec243 +Author: Seth Troisi +Date: Mon Aug 30 19:35:33 2021 -0700 + + check_gpuecm.sage add smallestGroupOrder + + This allows for + $ sage --preparse check_gpuecm.sage + $ mv -f check_gpuecm.sage.py check_gpuecm.py + $ python -c "import check_gpuecm; check_gpuecm.smallestGroupOrder(594538100848945223169882301931953, 3, 1000, 
10 ** 12, 1000)" + + Which can help find B1/B2 limits for tests where prime will be found. + +commit b018e41fd32c005b03771a452ac43d25802b187e +Author: Seth Troisi +Date: Wed Aug 25 16:27:04 2021 -0700 + + Fix cuda error in SM_70 for __any + +commit b3f5661cc60db68580aa29b4a9b95dae9962607d +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Fri Aug 6 15:08:53 2021 +0000 + + Update README - Add details about constraints on sigma. + +commit bb00c05b7f3e21a8eaaef1ce9e5dff96b8e7fe8f +Author: François Bissey +Date: Thu Jun 17 10:22:16 2021 +1200 + + Regenerate pentium4 asm files + +commit 69e38b5533cca1b02c13b276710a54122b42f2c1 +Author: François Bissey +Date: Thu Jun 17 10:21:12 2021 +1200 + + replace enumeration by sequence (pentium4) + +commit fea0e3a33573e03c9b8e1f3cefecbc055c67d637 +Author: François Bissey +Date: Thu Jun 17 10:07:55 2021 +1200 + + Add a no execstack block to generated asm files (pentium4) + +commit 7108cf5ed35f36a244964d74c4eb27fe55e64502 +Author: François Bissey +Date: Thu Jun 17 09:50:40 2021 +1200 + + fix autogen for py3 + +commit 981a332ef39c1530a22dac9528b31da0613daff8 +Author: François Bissey +Date: Thu Jun 17 09:48:42 2021 +1200 + + Regenerate all asm files + +commit fb41ea295ba00e6bb29db96e0bf10dad8516baa7 +Author: François Bissey +Date: Thu Jun 17 09:47:43 2021 +1200 + + Add a no execstack block to generated asm files + +commit 0356043c27c72486ec443bc5de6f63bd29110a51 +Author: François Bissey +Date: Thu Jun 17 09:37:40 2021 +1200 + + replace manual enumeration, bound to have mistakes sooner or later, with a sequence. 
+ +commit 9c2bc110735d93973d24d33679c889a69c6743fc +Author: François Bissey +Date: Thu Jun 17 09:36:57 2021 +1200 + + add braces to print in autogen.py so it is py3 compliant + +commit c6c2eaba2d66d5d1a23c9a736aed8f40fb8e5a61 +Merge: 8c75c05 29c5847 +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Tue Jun 15 23:27:27 2021 +0000 + + Merge branch 'execstack' into 'master' + + Execstack part 1: x86_64 + + See merge request zimmerma/ecm!6 + +commit 29c5847ee3bfeb0d6a94f2401b4798cd55a96dc0 +Author: François Bissey +Date: Tue Jun 15 11:45:00 2021 +1200 + + Add exec stack code to mulredc1.asm as it is not generated. + +commit 1106ce1ec56d1ecfb8b82ee3b60644f41289eab9 +Author: François Bissey +Date: Tue Jun 15 11:20:54 2021 +1200 + + Add appropriate assembly code at the end of the m4 templates with appropriate quoting. + +commit ab8338134ac854cfda7e17eda01bb1084303b422 +Author: François Bissey +Date: Tue Jun 15 11:17:37 2021 +1200 + + We don't plan to rely on --noexecstack anymore so removing detection and inclusion + +commit 4013d15f6cc7d16e05b79a14e32116870f25f450 +Merge: 1f5b919 8c75c05 +Author: Brian Gladman +Date: Tue Jan 26 18:29:11 2021 +0000 + + Merge branch 'master' of gitlab.inria.fr:zimmerma/ecm + +commit 1f5b919e714e06364db3be2d1265755e4129d387 +Author: Brian Gladman +Date: Tue Jan 26 18:28:51 2021 +0000 + + Update to CUDA 11.2 + +commit 8c75c05d6272282c8c3f3730294e6109dd4351f8 +Merge: 59bdbef 3d26545 +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Wed Nov 18 20:00:12 2020 +0100 + + Merge branch 'update-todo' into 'master' + + Remove primality proving from TODO + + See merge request zimmerma/ecm!4 + +commit 3d2654571a187cbb24b3d19cf270c5fafb100687 +Author: David Cleaver +Date: Wed Nov 18 12:57:33 2020 -0600 + + Remove primality proving from TODO + +commit 59bdbef9c14ab95b17f86b7824739fa194902896 +Author: David Cleaver <11787-x-dcleav@users.noreply.gitlab.inria.fr> +Date: Wed Nov 18 19:27:28 2020 +0100 + + 
Add config file for gitlab ci + +commit 554ca5c95f8410e8dc5811c2d7d02b1945de53e8 +Author: Paul Zimmermann +Date: Fri Nov 6 15:06:51 2020 +0100 + + added tag for 6.0.1 + +commit d875a8beff22eba4729b0fe010e687fab6a79a04 +Author: Paul Zimmermann +Date: Wed Nov 4 10:44:57 2020 +0100 + + update README.dev for change to git + +commit 009acd453a0a29ccfb57ffdd74af6cfad48be437 +Author: Brian Gladman +Date: Tue Nov 3 14:56:40 2020 +0000 + + Reinstate a declaration required by Visual C's out of date Open MP version + +commit 940c757cda8e4b51edb3923b21040ca779584911 +Merge: 2d7cfb2 35af4b5 +Author: Brian Gladman +Date: Tue Nov 3 12:06:41 2020 +0000 + + Merge branch 'master' of gitlab.inria.fr:zimmerma/ecm + +commit 2d7cfb2d1e18721bf9ef0d93ecd5b0edd6e1937f +Author: Brian Gladman +Date: Wed Oct 28 17:04:04 2020 +0000 + + Update Visual Studio 2019 build to CUDA 11.1 + +commit 35af4b5ff0e74c8015dc543b55c7432d9b246f16 +Author: Paul Zimmermann +Date: Fri Oct 23 16:02:45 2020 +0200 + + added note + +commit cbfad99a5cd4f33a2c02c7b8bacaa099ebba7a81 +Merge: fa2b11e a61ff7b +Author: ZIMMERMANN Paul +Date: Fri Oct 23 12:27:26 2020 +0200 + + Merge branch 'clang_openmp' into 'master' + + unsigned long are now acceptable variable for openMP + + Closes #21857 + + See merge request zimmerma/ecm!3 + +commit a61ff7bd9b30f7d2ec8bda44a9c23f951a58333a +Author: François Bissey +Date: Fri Oct 23 22:10:12 2020 +1300 + + unsigned long are now acceptable variable for openMP + +commit fa2b11ea408c33ea62f14494db915d55e4f28302 +Author: Paul Zimmermann +Date: Fri Oct 23 10:14:40 2020 +0200 + + fixed typo + +commit 2349ef93fe4ab99b6dc3390b98bdfccbbbde4a04 +Author: Paul Zimmermann +Date: Thu Oct 22 10:40:23 2020 +0200 + + fixed memory leaks + + (reported by Bruno Victal ) + +commit 09c5a3634b2101c56a94755fe647b4e953b41ba8 +Author: Paul Zimmermann +Date: Tue Oct 20 16:49:37 2020 +0200 + + updated references to ecm-discuss list and archives + +commit 3566332e4d56c8bbe0d8d4e34001d8d107407a41 +Author: Paul 
Zimmermann +Date: Wed Oct 14 09:54:08 2020 +0200 + + fixed "make longcheck" (reported by Christoph Conrads) + +commit c8eb7861219d9b43f2652ab84f0b93a4fbae8540 +Author: Paul Zimmermann +Date: Fri Oct 9 16:31:18 2020 +0200 + + added Jim Fougeron in .mailmap + +commit 30611601cc78676deb73d7db65c4234793a78993 +Author: Paul Zimmermann +Date: Fri Oct 9 16:21:04 2020 +0200 + + added .mailmap + +commit 54b2c2197dd1a8a57a16f11777aff71a12a8e1be +Author: Paul Zimmermann +Date: Mon Sep 28 13:33:38 2020 +0000 + + fix another issue reported by Wang Runsen + (https://lists.gforge.inria.fr/pipermail/ecm-discuss/2020-August/004495.html) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3092 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 6124ff41afe7343080c0a2117d4ae83433e91eb9 +Author: Paul Zimmermann +Date: Mon Sep 28 13:12:23 2020 +0000 + + fixed bug reported by Wang Runsen + (https://lists.gforge.inria.fr/pipermail/ecm-discuss/2020-August/004492.html) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3091 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 058c64c80cdeb16f0ff8e8d409c57ff50db714d6 +Author: Paul Zimmermann +Date: Mon Sep 28 12:26:56 2020 +0000 + + fixed problem reported by Christoph Conrads + (https://gforge.inria.fr/tracker/index.php?func=detail&aid=21755&group_id=135&atid=626) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3090 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 8fbb2809afe72799098e07d7c74fef4d6febbf70 +Author: Paul Zimmermann +Date: Fri Sep 18 14:44:26 2020 +0000 + + fix for clang 12 from Apple's Xcode 12 + (https://gforge.inria.fr/tracker/?func=detail&atid=623&aid=21856&group_id=135) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3089 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 388d48e8997ef1b3cceffd84479425c98eb9898f +Author: Brian Gladman +Date: Mon Jul 27 12:16:12 2020 +0000 + + remove *.obj file + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3088 
404564d9-a503-0410-82bf-e18ce2cf3989 + +commit b22add97119f0f7b558040ac7d15a5bdb501a4c1 +Author: Brian Gladman +Date: Mon Jul 27 11:40:30 2020 +0000 + + update Visual Studio 2019 build to use CUDA 11.0 + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3087 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 69c359dc5b4996449bd50ff8838edf90bc74765e +Author: Brian Gladman +Date: Wed Jul 8 15:09:06 2020 +0000 + + update Visual Studio 2019 build to use CUDA 11.0 + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3086 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit ca7d4e2bda8f42bbf9ab60d201fa5496e012531c +Author: Paul Zimmermann +Date: Fri May 22 07:47:51 2020 +0000 + + [INSTALL-ecm] added note related to GWNUM + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3085 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit cf5e6273abe14d00878ebd62cc15e024adb1b8ad +Author: Paul Zimmermann +Date: Thu May 21 06:39:58 2020 +0000 + + [Fgw.c] fix bug reported at https://mersenneforum.org/showthread.php?t=25559 + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3084 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 114aa487e16c678eb95acb63c4d51450be33c46a +Author: Paul Zimmermann +Date: Thu May 21 06:08:45 2020 +0000 + + [INSTALL-ecm] updated latest GWNUM version + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3083 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 2f35ffb44690ec51c6f1f9e15fd742ec41e90212 +Author: Paul Zimmermann +Date: Tue Apr 21 07:25:26 2020 +0000 + + partly revert previous commit (+ cast) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3082 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 9e20ec01d4cbf11acbef5b31f47ef09fffa005d2 +Author: Paul Zimmermann +Date: Sun Apr 19 06:35:39 2020 +0000 + + use AC_PROG_CC_C99 + print fix + (https://lists.gforge.inria.fr/pipermail/ecm-discuss/2020-April/004485.html) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3081 404564d9-a503-0410-82bf-e18ce2cf3989 + 
+commit 4ec21e29f65115a4fa36fe0d1a6e08366a82f464 +Author: Paul Zimmermann +Date: Fri Mar 20 15:17:28 2020 +0000 + + added comment + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3080 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 82d017b69ec96d50babdc57b11a012d9501eee03 +Author: Paul Zimmermann +Date: Fri Mar 20 15:09:52 2020 +0000 + + added ecm_reset() + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3079 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit c72938126f3c6be2cbd548958c85b06b30ed6f78 +Author: Paul Zimmermann +Date: Thu Mar 19 13:44:31 2020 +0000 + + added patch to solve bug [#21836] Cross-Compiling to mingw32 terminates with "memusage.c:72: undefined reference to `GetProcessMemoryInfo@12'" + + https://gforge.inria.fr/tracker/index.php?func=detail&aid=21836&group_id=135&atid=623 + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3078 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit bcb4c1baff86cac4981190a1723ab110742488d8 +Author: Paul Zimmermann +Date: Thu Mar 19 09:58:41 2020 +0000 + + added -std=c99 to CFLAGS + (https://gforge.inria.fr/tracker/index.php?func=detail&aid=21835&group_id=135&atid=623) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3077 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 18bc4a8cededb917c115ab35b714e3d736c8974d +Author: seth troisi +Date: Tue Jan 28 08:44:04 2020 +0000 + + Improve check_gpuecm.sage by using order of P(x0=2,y0=1) + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3076 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 18f12a3cdc398be1b74e2e64867dacda632c3bce +Author: seth troisi +Date: Mon Jan 27 00:02:22 2020 +0000 + + Added check_gpuecm.sage script + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3075 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 8ebe0590558f50af0e445ce90eba8beedefe4d92 +Author: seth troisi +Date: Fri Jan 24 04:21:15 2020 +0000 + + Add test to verify number of factors found by gpucurves + + git-svn-id: 
svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3074 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit a8fa12f0e18090cb0e3abaaf89dc392cd1739c61 +Author: seth troisi +Date: Wed Jan 22 10:36:50 2020 +0000 + + Revert changes to FindGroupOrderA in r3072 + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3073 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 1d664f31bc4ea0dcd13b194a53aba5b288020575 +Author: seth troisi +Date: Wed Jan 22 09:19:35 2020 +0000 + + Simplify FindGroupOrder in check.sage + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3072 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit dae573d6fffb8e2aa20fba16794376896cdaafa6 +Author: seth troisi +Date: Fri Jan 17 23:24:33 2020 +0000 + + Fix man page description of -param + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3071 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit aec9d06ea7095bebf84c603c9e3fbf1656776054 +Author: seth troisi +Date: Fri Jan 17 03:49:22 2020 +0000 + + add -primetest and -param to man page + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3070 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 42060008c7a0ad1fa5fcb32cf7d69f69cd028996 +Author: seth troisi +Date: Wed Jan 15 06:42:13 2020 +0000 + + Added 'APR primality test:' to status output + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3069 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit fcab1a1192fef51c26c58940307bcbf024ec5753 +Author: Paul Zimmermann +Date: Mon Nov 25 14:09:17 2019 +0000 + + fix test for "-bsaves/-bloads can only be used with..." 
+ + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3068 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 42feec8b941f483313023214d7ed735bc3c86d55 +Author: Paul Zimmermann +Date: Tue Nov 12 10:24:01 2019 +0000 + + fix bug reported on https://lists.gforge.inria.fr/pipermail/ecm-discuss/2019-November/004451.html + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3067 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit dc0bc24846c5fb863ae7a595fbd9979c69b18f0d +Author: Paul Zimmermann +Date: Tue Nov 12 08:44:00 2019 +0000 + + removed temporary code from revision 3064 + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3066 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 4abd865f3c7adad08ca44e09012856156a6d0a9d +Author: Chen Wang +Date: Sat Nov 9 16:10:14 2019 +0000 + + [test.ecm] add test cases for PhiL/M (for each base), primU and GCD. + [eval.c] restored eval_GCD; fixed bug in eval_phi where phi(p^n,1) is calculated as 1. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3065 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 063f488fcbf24c2fa240be7505f6d93126337ab2 +Author: Paul Zimmermann +Date: Wed Nov 6 19:07:40 2019 +0000 + + disable temporarily eval_gcd until it is tested by "make check" + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3064 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit f43ebb563ca857bb81460f88f128d42392ebc85f +Author: Chen Wang +Date: Wed Nov 6 18:44:41 2019 +0000 + + Removed a leftover debug fprintf + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3063 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 867b4a06da4712d7d58d120f37c9ea15318a3840 +Author: Chen Wang +Date: Wed Nov 6 18:36:19 2019 +0000 + + Crude implementation of "Aurifeuillian primitive part" functions PhiL and PhiM + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3062 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit e5a5e4c05d5d3f104ad0e26286251f9691921a7e +Author: Chen Wang +Date: Tue Nov 5 04:17:42 2019 +0000 + + 
make the parser actually recognize gcd function + + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3061 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 3a8bb7c466ae04ef47f68e363cc70392f6493bfd +Author: Chen Wang +Date: Wed Oct 30 14:36:33 2019 +0000 + + Added a GCD function + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3060 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 3a8c9891c0f2eb86b27595e5909a9199da6535b6 +Author: Paul Zimmermann +Date: Tue Oct 22 11:11:52 2019 +0000 + + added more comments + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3059 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit bd98fd3fc035c36e6bdd5657600df8403b8e0be4 +Author: Paul Zimmermann +Date: Tue Oct 22 10:56:10 2019 +0000 + + fixed memory leak and added comment + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3058 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 1456a3c66e98ab490d2df72d1f38d01c4a764692 +Author: Paul Zimmermann +Date: Tue Oct 15 11:12:48 2019 +0000 + + added comment + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3057 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit ce1ee5ce6fa5a6285b1a6a4a32c7e55b5f9f79eb +Author: Paul Zimmermann +Date: Tue Oct 15 11:12:19 2019 +0000 + + added probability of finding a factor for P+1 + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3056 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 55c49f4b6539ddeb4d5541cc7100fe1145ddb091 +Author: Paul Zimmermann +Date: Tue Oct 15 10:27:54 2019 +0000 + + added reference + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3055 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit edcd2ccf968506f4303373a013cbdc1cf19510d6 +Author: Paul Zimmermann +Date: Mon Oct 14 19:48:27 2019 +0000 + + added "assuming one exists" to "Probability of finding a factor of n digits" + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3054 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit cebdd049d9d0a3f4083edcb1b89088d75a66b3b7 +Author: Brian 
Gladman +Date: Tue Oct 1 15:27:20 2019 +0000 + + Revise the explanation of the use of bash to run the tests on Windows + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3053 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit ef9830e0803d02a518cc994a249910096ebe3d34 +Author: Paul Zimmermann +Date: Tue Aug 20 08:38:15 2019 +0000 + + fixed typos + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3052 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 4c7ab86b7892798e6db731b3d9f496851f20eba6 +Author: Brian Gladman +Date: Tue Aug 20 08:12:47 2019 +0000 + + Add a description of how to run the tests on Windows using bash + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3051 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit b4c70e3bcc7d0f178e5962720da1d84323d1ef3c +Author: Paul Zimmermann +Date: Tue Jun 11 12:53:23 2019 +0000 + + fixed again mpzspv_from_mpzv_fast (the solution from r3047 was incorrect, + since mpzspv_from_mpzv_fast might be called twice at the same time with the + same thread number) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3050 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 3c6ddc8d8a571b3db14716e6997625bf246ed6fd +Author: Brian Gladman +Date: Thu May 16 16:34:52 2019 +0000 + + Add build files for Visual Studio 2019 (in build.vs) + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3049 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 93c7660da6dd4eed27f63772fd31a9bdd23ff7f1 +Author: Paul Zimmermann +Date: Fri May 10 13:27:24 2019 +0000 + + added note about issue with GWNUM + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3048 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 59e7682aed93d0e267101b1b642fae5f44683697 +Author: Paul Zimmermann +Date: Fri May 3 13:30:47 2019 +0000 + + avoid alloc/free in mpzspv_from_mpzv_fast() + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3047 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit d6d274a371359824fa408ad300dfb64ba02abde1 +Author: Paul 
Zimmermann +Date: Fri May 3 11:08:35 2019 +0000 + + fixed mpzspv_from_mpzv_fast() [was not thread-safe] + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3046 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit cbd34997261056d1ac0239323aa388d85240b8f5 +Author: Paul Zimmermann +Date: Fri May 3 08:04:29 2019 +0000 + + fixed another typo + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3045 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit b2ea1a4d135e5f37b5a6f5a1f968ae268e4f2ed8 +Author: Paul Zimmermann +Date: Fri May 3 08:00:44 2019 +0000 + + fixed compiler warning + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3044 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 0c5809a31df52e14b9b0f111980579453c31eccb +Author: Paul Zimmermann +Date: Fri May 3 08:00:20 2019 +0000 + + fixed typo + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3043 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 85ac25d86487c04f8456bd10dab674785790c028 +Author: Paul Zimmermann +Date: Thu May 2 15:57:54 2019 +0000 + + fixed typo + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3042 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 8cbcd1eee399777cde48df9ef9ad7ce9c5d5a647 +Author: Paul Zimmermann +Date: Thu May 2 08:49:07 2019 +0000 + + need -ldl with GWNUM + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3041 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit c42aeb6b76c2b9c403bf5764147af31cee046f45 +Author: Paul Zimmermann +Date: Thu May 2 08:43:10 2019 +0000 + + updated paragraph about GWNUM + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3040 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 2dd171624b4940aca530a80433ad8841b7aee53b +Author: Paul Zimmermann +Date: Tue Apr 30 15:11:42 2019 +0000 + + several fixes for GWNUM + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3039 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit cfb68d8744904907d299657d39d6d6c214114a07 +Author: Paul Zimmermann +Date: Wed Apr 3 10:52:02 
2019 +0000 + + clarify "default B2" + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3038 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit a06a8fa556f2c713170cffb12082b5cec405a04e +Author: Chen Wang +Date: Sat Jan 26 02:03:23 2019 +0000 + + Unified interface for function parsing. + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3037 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 0e0a9743a0f7187f25deeb20a459ee7373fc0471 +Author: Paul Zimmermann +Date: Tue Jan 15 16:01:24 2019 +0000 + + [Makefile.am] added -fPIC for nvcc + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3036 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit ac64652f2b20b8089fbb91489ce65711ac38f8a4 +Author: Paul Zimmermann +Date: Tue Jan 15 12:16:49 2019 +0000 + + [gpu-ecm] added compute capability 60 61 62 70 72 75 + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3035 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 42e817926a5091c3c5e01d0ce053bf2be284d333 +Author: Brian Gladman +Date: Tue Dec 4 22:44:02 2018 +0000 + + correct bug in YASM integration file (vsyasm.targets) + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3034 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit a913e07f72a3031a71ceb4e0c3e692d1357cdb83 +Author: Chen Wang +Date: Mon Nov 26 23:08:20 2018 +0000 + + Added parsable functions U(p,q,n) and primU(p,q,n) as parsable functions. 
+ + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3033 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 4ff4b164b26fd79486abcb845848d77a70f6b1e1 +Author: Brian Gladman +Date: Fri Oct 26 23:24:27 2018 +0000 + + update Visual Studio 2017 build to CUDA 10.0 + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3032 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 6bbea2905c39c4ac83952f47e9cf68c903020f7e +Author: Paul Zimmermann +Date: Fri Jul 20 15:25:08 2018 +0000 + + [pm1.c] fixed bug reported by Anton Repko when B2 < B1 + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3031 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 21d62ff1ab094f8eb252dabbc8498d9e0dc40f98 +Author: Paul Zimmermann +Date: Mon Jun 18 12:16:16 2018 +0000 + + fixed wrong documentation (reported by Tanaydın Şirin ) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3030 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit d9dacc459c43a5ea58c2fa09b6f6ab5ec55f0b53 +Author: Brian Gladman +Date: Thu Jun 7 17:16:57 2018 +0000 + + update to CUDA 9.2 + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3029 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit e62771c8821dfb9794a94a13f6e5f73b8af2fd82 +Author: Paul Leyland +Date: Sun Dec 10 15:29:46 2017 +0000 + + Determine CUDA minimum compute capability at configure time and set + -DECM_GPU_CURVES_BY_BLOCK=16 for cc2.x + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3027 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 72ec461885d7945bd2ea0ef8981a9a912b0ff34b +Author: Paul Leyland +Date: Sun Dec 10 13:04:02 2017 +0000 + + Corrected minor typos + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3026 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit b3b48cce63e180f8bb0c5919b6ea0cae7d22c2a5 +Author: Paul Leyland +Date: Sun Dec 10 11:00:21 2017 +0000 + + Set -gpucurves 32 or small cards fail the final two tests. 
+ + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3025 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 09ddf0bc538278fa5c803aabe38d11f037d9dfba +Author: Brian Gladman +Date: Sun Nov 5 14:28:38 2017 +0000 + + set up the build for static linking + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3024 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 9d809aba3463c2a05a749d5bb80feac9cc604b74 +Author: Brian Gladman +Date: Fri Nov 3 16:50:23 2017 +0000 + + Add Visual Studio 2017 build (with CUDA 9) + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3023 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit a66397a96296a9b96191bb068a9cc8e9ebcda71c +Author: Paul Zimmermann +Date: Tue May 16 11:09:39 2017 +0000 + + fix for "make check -j" + + (see https://trac.sagemath.org/ticket/20385#comment:161) + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3022 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 0654e369e9daa9fd130cf9e17fb3a1da4cc1be6d +Author: Paul Zimmermann +Date: Mon Apr 24 14:53:38 2017 +0000 + + add in README that for ECM, -chkpnt is only implemented with -param 0 so far + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3021 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 6f2849903f089920594a102afb3abc43276d083a +Author: Paul Zimmermann +Date: Mon Apr 24 14:50:54 2017 +0000 + + say that -chkpnt only works with -param 0 + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3020 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit fe6f87b349d0e68b4cdb31580f61a115addc363b +Author: Cyril Bouvier +Date: Wed Mar 1 22:04:17 2017 +0000 + + Update code from gpu_ecm: param is no longer a pointer + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3019 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 307a0737c6c67e543aa537c035b3dc3e5b055a9f +Author: Cyril Bouvier +Date: Wed Mar 1 22:03:57 2017 +0000 + + Update ecm prototype in ecm.h: param is no longer a pointer + + + git-svn-id: 
svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3018 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 0d5e92d2247316ab300ebc65f69120111eb4ad31 +Author: Paul Zimmermann +Date: Wed Mar 1 18:35:42 2017 +0000 + + fixed a bug reported by Sam Wagstaff + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3017 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 5876a444f1b270a2f987b19a8b60ffebed6967bc +Author: David Cleaver +Date: Sun Feb 12 22:36:35 2017 +0000 + + - Finalize using ecm_tstbit in code where needed. + - Update B1 bounds for Windows, 3124253146UL for GMP <= 6, 50685770166ULL for MPIR. + - Clean up code formatting in batch.c + - Minor changes to fix some compiler warnings. + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3016 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 1a1373c7ecab8d32d0df8f78f338845c07d481f5 +Author: Paul Zimmermann +Date: Mon Feb 6 10:22:11 2017 +0000 + + changed B1 < MAX_B1_BATCH into B1 <= MAX_B1_BATCH + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3015 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 1d28f9f8b4c7a8f7983af597804c4bdbb98404a8 +Author: David Cleaver +Date: Sat Feb 4 18:33:21 2017 +0000 + + Fix for systems with 32-bit longs that would miss factors when using GMP<=6.x, with B1>2977044736UL. 
+ + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3014 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit cfcd5fe3f9d26c47924169d3538a868356c769b6 +Author: Cyril Bouvier +Date: Mon Nov 7 10:21:09 2016 +0000 + + Revert previous commit + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3013 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 2d344c453bf183930b5a392d99d876b32f604784 +Author: Cyril Bouvier +Date: Mon Nov 7 09:30:40 2016 +0000 + + Trying to fix a compilation error with CUDA7.5 + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3012 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 80017fca3e6b86fba2d2265984c415f89109c372 +Author: Francois Morain +Date: Mon Oct 24 15:54:15 2016 +0000 + + 90%+90%. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3011 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 2ac643a44a889639bd89aa0d7db5995646acdd54 +Author: Francois Morain +Date: Sun Oct 23 17:00:11 2016 +0000 + + first time above 90%. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3010 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 4b29160faf36af00ab382b30d6ce1bed8bbf516b +Author: Francois Morain +Date: Sat Oct 22 18:58:53 2016 +0000 + + one more test. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3009 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit b1034e2d4b99bd2bcc0ae2d986a35cf5a2b35d96 +Author: Francois Morain +Date: Sat Oct 22 07:46:41 2016 +0000 + + more and more cleaning. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3008 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit abc73751e788ac07f804bd9156f80ee4d14b51b6 +Author: Francois Morain +Date: Sat Oct 22 07:30:55 2016 +0000 + + cleaning. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3007 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit f19d4f9e5042caa617758d035249274818769423 +Author: Francois Morain +Date: Fri Oct 21 19:01:08 2016 +0000 + + printing is needed in debug mode only. 
+ + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3006 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 5b5720b055dcab14b6efdded7224e603b2dafaa6 +Author: Francois Morain +Date: Thu Oct 20 17:10:17 2016 +0000 + + Still more covering of addlaws.c, finding more bugs. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3005 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit ad816ada2e6932a5fb8fc1983122b1ee15574b41 +Author: Francois Morain +Date: Thu Oct 20 11:36:56 2016 +0000 + + more cleaning and code moving to improve coverage of addlaws.c . + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3004 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 1e99299988a9ccb4c48d10acf0cface5b3839737 +Author: Francois Morain +Date: Thu Oct 20 11:22:00 2016 +0000 + + some info about gcov. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3003 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit c38e1fd5797690dcbee02e8bcf6359d191265984 +Author: Francois Morain +Date: Wed Oct 19 13:37:38 2016 +0000 + + more simplification of addlaws.c . + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3002 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit b792ea465506d467b545797d01b58947297c3a9d +Author: Francois Morain +Date: Tue Oct 18 15:45:50 2016 +0000 + + torsions.c is covered up to errors that cannot be simulated. More work + is needed for addlaws.c, in progress. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3001 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 61ff7f30b1aeb3bc33a0470ae1411a39edcf7e55 +Author: Francois Morain +Date: Sun Oct 16 16:53:06 2016 +0000 + + More covering of Z2xZ8, Z4xZ4 and covering for Z3xZ3. Still missing is Z3xZ6. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@3000 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 81baa9f034a1628a179a346413756f3eb49edc5f +Author: Francois Morain +Date: Fri Oct 14 20:05:04 2016 +0000 + + More covering of Z9, Z2xZ8; simplification of Z10; cleaning of torsions.c . 
+ + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@2999 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 27708faaf8331372e8c357d4af9e67bc8874f6ee +Author: Francois Morain +Date: Thu Oct 13 13:04:56 2016 +0000 + + advancing in the covering of torsions.{c,h} . Not finished. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@2998 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit ab4234f77a9ae8cf24993428e47df0fc745cd7a6 +Author: Francois Morain +Date: Wed Oct 12 16:41:10 2016 +0000 + + reintroducing all tests => some cleaning made after a bug occurred during + the old tests. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@2997 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 0f74c0baa5d7670dc39778c47eb0041ea9ad79d2 +Author: Francois Morain +Date: Wed Oct 12 08:18:18 2016 +0000 + + reinjecting tests for Weierstrass form only. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@2996 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 421fc4baad95a95895a0a432d6685e37029ef5d0 +Author: Francois Morain +Date: Tue Oct 11 16:14:57 2016 +0000 + + Forgot. + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@2995 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit 2da354c493c2969152705f6e0a6ab30fb4652dea +Author: Francois Morain +Date: Tue Oct 11 12:52:31 2016 +0000 + + introducing JKL curves + projective twisted Hessian curves a la Bernstein. + More cover tests are to be added soon. 
+ + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@2994 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit b43a451e72d4226c578253f821e0069675569260 +Author: Paul Zimmermann +Date: Tue Oct 11 09:34:42 2016 +0000 + + version is now 7.0.5-dev + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@2993 404564d9-a503-0410-82bf-e18ce2cf3989 + +commit fcc85095a977d7620c50923cb0d0d9551d1eaa5f +Author: Paul Zimmermann +Date: Tue Oct 11 09:29:07 2016 +0000 + + prepare for release 7.0.4 + + + git-svn-id: svn://scm.gforge.inria.fr/svnroot/ecm/trunk@2991 404564d9-a503-0410-82bf-e18ce2cf3989 ------------------------------------------------------------------------ r2990 | brian_gladman | 2016-09-29 13:13:04 +0200 (Thu, 29 Sep 2016) | 1 line Changed paths: @@ -127,7 +1893,7 @@ Changed paths: M /trunk/configure.ac -patch from François Bissey to address issue on Mac OS X +patch from François Bissey to address issue on Mac OS X ------------------------------------------------------------------------ r2981 | zimmerma | 2016-08-25 15:44:58 +0200 (Thu, 25 Aug 2016) | 2 lines @@ -215,7 +1981,7 @@ M /trunk/README.gpu [README.gpu] added hint about Cuda error "too many resources requested for launch" -(see https://lists.gforge.inria.fr/pipermail/ecm-discuss/2016-July/004368.html) +(see https://sympa.inria.fr/sympa/arc/ecm-discuss/2016-07/msg00004.html) ------------------------------------------------------------------------ r2968 | dcleaver | 2016-07-31 20:12:55 +0200 (Sun, 31 Jul 2016) | 1 line @@ -233,7 +1999,7 @@ M /trunk/test.gpuecm increased initial sigma so that test works with ECM_GPU_CURVES_BY_BLOCK=16 -(see https://lists.gforge.inria.fr/pipermail/ecm-discuss/2016-July/004368.html) +(see https://sympa.inria.fr/sympa/arc/ecm-discuss/2016-07/msg00004.html) ------------------------------------------------------------------------ r2966 | dcleaver | 2016-07-24 16:06:00 +0200 (Sun, 24 Jul 2016) | 1 line @@ -315,7 +2081,7 @@ M /trunk/main.c fixed duplicate output with -q 
reported by Par Kurlberg -(https://lists.gforge.inria.fr/pipermail/ecm-discuss/2016-June/004361.html) +(https://sympa.inria.fr/sympa/arc/ecm-discuss/2016-06/msg00004.html) ------------------------------------------------------------------------ r2957 | zimmerma | 2016-06-17 17:37:53 +0200 (Fri, 17 Jun 2016) | 2 lines @@ -402,7 +2168,7 @@ M /trunk/parametrizations.c removed debug stuff (it was a compiler issue, cf -https://lists.gforge.inria.fr/pipermail/ecm-discuss/2016-June/004360.html) +https://sympa.inria.fr/sympa/arc/ecm-discuss/2016-06/msg00003.html) ------------------------------------------------------------------------ r2945 | zimmerma | 2016-06-15 14:22:37 +0200 (Wed, 15 Jun 2016) | 2 lines @@ -3604,7 +5370,7 @@ M /trunk/test.pm1 fixed bug reported by Andrew Booker -(http://lists.gforge.inria.fr/pipermail/ecm-discuss/2013-March/004214.html): +(https://sympa.inria.fr/sympa/arc/ecm-discuss/2013-03/msg00000.html): when B1=1, the -go parameter was ignored for P-1 ------------------------------------------------------------------------ @@ -7995,7 +9761,7 @@ M /trunk/mpmod.c fixed various problems with ecm-6.4.1-rc1 reported by David Cleaver -(http://lists.gforge.inria.fr/pipermail/ecm-discuss/2012-March/004144.html): +(https://sympa.inria.fr/sympa/arc/ecm-discuss/2012-03/msg00024.html): * replaced unsigned long by mp_limb_t in batch=1 (under Windows, unsigned long has 32 bits only) * configure.in: added -lm for mathematical functions @@ -10168,7 +11934,7 @@ Changed paths: M /trunk/Makefile.am -[Makefile.am] better fix for #10648 (contributed from Vincent Lefèvre) +[Makefile.am] better fix for #10648 (contributed from Vincent Lefèvre) ------------------------------------------------------------------------ r1518 | zimmerma | 2010-07-01 17:38:51 +0200 (Thu, 01 Jul 2010) | 4 lines @@ -10944,7 +12710,7 @@ M /trunk/test_mulredc.c M /trunk/tune.c -check return value of malloc in several places (bug reported by Torbjörn Granlund) +check return value of malloc in 
several places (bug reported by Torbjörn Granlund) ------------------------------------------------------------------------ r1413 | zimmerma | 2009-04-25 23:34:58 +0200 (Sat, 25 Apr 2009) | 2 lines @@ -11549,7 +13315,7 @@ A /trunk/mul_fft-params.h.pentium4 M /trunk/mul_fft.c -Moved parameters for Schönhage-Strassen into separate file so that +Moved parameters for Schönhage-Strassen into separate file so that tune output does not overwrite them ------------------------------------------------------------------------ @@ -12697,7 +14463,7 @@ Changed paths: M /trunk/README -Updated to NTT and SchönhageStrassen sections +Updated to NTT and SchönhageStrassen sections ------------------------------------------------------------------------ r1215 | kruppa | 2008-04-17 11:30:23 +0200 (Thu, 17 Apr 2008) | 2 lines diff -Nru gmp-ecm-7.0.4+ds/check_gpuecm.sage gmp-ecm-7.0.5+ds/check_gpuecm.sage --- gmp-ecm-7.0.4+ds/check_gpuecm.sage 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/check_gpuecm.sage 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,435 @@ +# script to spot check ecm -gpu stage 1 implementation. +# +# Computes order of a elliptic curve mod a prime to determine if that prime +# will be found at a specified B1 level, then runs ecm -gpu to verify that +# those primes are indeed found. 
+# +# to perform a quick test: 5 iterations of 32 curves at B1=10000 and B1=100000 +# $ sage check_gpuecm.sage ./ecm +# for even faster check +# $ sage check_gpuecm.sage --iterations 1 ./ecm +# +# to test with ECM_GPU_NB_DIGITS=16 and extra verbose output +# $ sage check_gpuecm.sage --nbits 512 -v -v ./ecm +# +# to test more iterations (~12 minutes) or more curves per iteration +# $ sage check_gpuecm.sage --iterations 20 ./ecm +# $ sage check_gpuecm.sage --gpucurves 128 ./ecm +# +# to see all options +# $ sage check_gpuecm.sage -h +# +# On error, will exit with return code 1 and stdout +# 'Wrong result for seed=' +# after checking with a clean build please email Seth Troisi or ecm-discuss +# with seed (and any non standard options you may have used). + + +# To suppress '*** Warning: increasing stack size to 2000000.' from pari. +pari.allocatemem(4000000, silent=True) + +import argparse +import random +import re +import subprocess +import sys + +parser = argparse.ArgumentParser(description='Spot check for ecm -gpu stage1') + +parser.add_argument('ecm_cmd', type=str, + help='which ecm (e.g. 
ecm, ./ecm) to run') + +parser.add_argument('--iterations', type=int, default=5, + help='Number of tests (small + large) to perform') + +parser.add_argument('-c', '--gpucurves', type=int, default=32, + help='number of curves to test in a batch [default: 32]') + +parser.add_argument('--B1', type=int, default=10000, + help='B1 to test at [default: 10,000]') + +parser.add_argument('--seed', type=int, default=None, + help='Random seed') + +parser.add_argument('--nbits', '-n', type=int, default=1024, + help='Only needed if ECM_GPU_NB_DIGITS was adjusted') + +parser.add_argument('--timing', action='store_true', + help='Producing timing information') + +parser.add_argument('--verbose', '-v', action='count', default=1, + help='Print more output (pass -v -v for even more)') +parser.add_argument('--quiet', '-q', + action='store_const', const=0, dest='verbose', + help='Suppress most output') + + + +# Currently GPU can only do param=3 +GPU_PARAM = 3 + +FACTOR_FOUND_RE = re.compile( + 'factor ([0-9]*) found in Step 1.*-sigma [0-4]:([0-9]*)\)') + + +def CurveParam0(p, sigma): + K = GF(p) + v = K(4*sigma) + u = K(sigma^2 - 5) + x = u^3 + b = 4*x*v + a = (v-u)^3*(3*u+v) + A = a/b-2 + x = x/v^3 + b = x^3 + A*x^2 + x + return EllipticCurve(K,[0,A/b,0,1/b^2,0]) + +def PointOrderS(p, s): + K = GF(p) + A = K(4 *s - 2) + b = K(16*s + 2) + E = EllipticCurve(K,[0,A/b,0,1/b^2,0]) + return E(2/b,1/b) # x0=2, y0=1 + +# From parametrizations.c +def CurveParam2(p, sigma): + K = GF(p) + E = EllipticCurve(K,[0,36]) + P = sigma*E(-3,3) + x,y = P.xy() + x3 = (3*x+y+6)/(2*(y-3)) + A = -(3*x3^4+6*x3^2-1)/(4*x3^3) + d = K((A+2)/4) + return PointOrderS(p, d) + +def CurveParam1(p, sigma): + K = GF(p) + return PointOrderS(p, K(sigma^2 / 2^64)) + +def CurveParam3(p, sigma): + K = GF(p) + return PointOrderS(p, K(sigma / 2^32)) + +def GroupOrder(param, prime, sigma): + if param == 0: + ec = CurveParam0(prime, sigma) + elif param == 1: + ec = CurveParam1(prime, sigma) + elif param == 2: + ec = 
CurveParam2(prime, sigma) + elif param == 3: + ec = CurveParam3(prime, sigma) + else: + raise ValueError('Unknown param: ' + str(param)) + + return ec.order() + + +def testInternal(): + # Verify code is working. + + # echo '78257675131877111603' | ecm -sigma '0:4528' 3100 8000-9000 + assert GroupOrder(0, 78257675131877111603, 4528) == \ + 2^3 * 3 * 71 * 563 * 1531 * 2153 * 3011 * 8219 + # echo '78257675131877111603' | ecm -sigma '1:3396' 4800 8000-9000 + assert GroupOrder(1, 78257675131877111603, 3396) == \ + 2 * 3 * 223 * 271 * 811 * 821 * 4799 * 8443 + # echo '78257675131877111603' | ecm -sigma '2:1801' 2100 9000-9300 + assert GroupOrder(2, 78257675131877111603, 1801) == \ + 2^8 * 3^3 * 23 * 41 * 47 * 67 * 101 * 2039 * 9257 + # echo '78257675131877111603' | ecm -sigma '3:2012' 6000 8000-9000 + assert GroupOrder(3, 78257675131877111603, 2012) == \ + 2^2 * 5^3 * 43^2 * 71 * 83 * 139 * 5779 * 8941 + + # echo '1082500099132634560519' | ecm -sigma '0:6677' 1200 5000-6000 + assert GroupOrder(0, 1082500099132634560519, 6677) == \ + 2^2 * 3 * 5^2 * 7 * 139 * 677 * 887 * 947 * 1123 * 5807 + # echo '1082500099132634560519' | ecm -sigma '1:1800' 4000 6000-7000 + assert GroupOrder(1, 1082500099132634560519, 1800) == \ + 2^5 * 13 * 17 * 79 * 701 * 2647 * 3347 * 6367 + # echo '1082500099132634560519' | ecm -sigma '2:2966' 2000 7000-8000 + assert GroupOrder(2, 1082500099132634560519, 2966) == \ + 2^3 * 3 * 29 * 31^2 * 61 * 109 * 487 * 1709 * 7499 + # echo '1082500099132634560519' | ecm -sigma '3:1600' 2000 3000-3100 + assert GroupOrder(3, 1082500099132634560519, 1600) == \ + 3^3 * 7 * 23^2 * 37 * 67 * 71 * 1297 * 1933 * 3067 + + # From Zimmermann, https://homepages.cwi.nl/~herman/Zimmermann.pdf + assert GroupOrder(0, 322410908070969630339041359359164154612901586904078700184707, 20041348) == \ + 2^4 * 3^2 * 391063 * 1197631 * 82011967 * 126033329 * 1926338723 * 4654300159 * 51585518429 + + # From David Broadhurst, 
https://sympa.inria.fr/sympa/arc/ecm-discuss/2005-09/msg00022.html + assert GroupOrder(0, 2580118483169716809210552261225054520765090617558895237, 161957884569085) == \ + 2^2 * 3 * 1483 * 91381 * 103231 * 239587 * 1151317 * 1186033 * 1611697 * 4199071 * 6941601157 + + # From David Broadhurst, https://sympa.inria.fr/sympa/arc/ecm-discuss/2005-09/msg00020.html + assert GroupOrder(0, 73372650975767950626979890709193208431269141871367229612025497, 175923) == \ + 2^2 * 3^2 * 13 * 41 * 3389 * 3989 * 1662013 * 2782993 * 5013037 * 94921033 * 1144363489 * 112303943380877 + + # From David Broadhurst, https://sympa.inria.fr/sympa/arc/ecm-discuss/2005-09/msg00032.html + assert GroupOrder(0, 6314722182591714308391592266483806595696758378370807102207443753223500809, 2481305347) == \ + 2^3 * 3^6 * 11 * 13^2 * 17^4 * 31^2 * 53^2 * 163 * 449 * 853^2 * 3923^2 * 7489 * 11113 * \ + 23459^2 * 116531 * 1016891 * 580801721 + + if args.verbose: + print('Implementation successfully tested\n') + + +def smallestGroupOrder(prime, param, sigma_0, num_curves): + ''' + Print smallest B1 (and B2) needed to find a curve from sigma_0 ... 
sigma_0 + curves + At the end return the smallest sigma + ''' + # This is not used directly in check_gpuecm.sage, but is very useful for building tests + found_at = None + smallest = 10 ^ 100 + for sigma in range(sigma_0, sigma_0 + num_curves): + order = GroupOrder(param, prime, sigma) + assert 1 <= order < 2 * prime, (prime, param, sigma, order) + + f = factor(order) + # Largest prime + exp = sorted(p ** k for p, k in f) + # B1 to find this factor is 2nd smallest if B2 = 500 * B1 finds large factor + min_b1 = exp[-2] if len(exp) >= 2 and exp[-1] < exp[-2] * 500 else exp[-1] + if min_b1 <= smallest: + print ("\tFound by sigma: %d with B1=%d, B2=%d" % (sigma, min_b1, exp[-1])) + smallest = min_b1 + found_at = sigma + return found_at + + +def smallGroupOrders(prime, param, sigma_0, test_B1, num_curves): + '''Find list of sigmas that will find prime with B1 >= test_B1''' + for sigma in range(sigma_0, sigma_0 + num_curves): + order = GroupOrder(param, prime, sigma) + assert 1 <= order < 2 * prime, (prime, param, sigma, order) + + f = factor(order) + assert len(f) >= 1, (order, f) + + for p, k in f: + if p ^ k > test_B1: + break + else: + yield sigma + + +def findPrimesOfSize(count, prime_size): + '''Find count primes with prime_size bits''' + primes = set() + for pi in range(count): + for test in range(100): + r = ZZ(random.randint(2 ^ (prime_size-1), 2 ^ prime_size)) + prime = Primes().next(r) + if prime not in primes: + primes.add(prime) + break + else: + raise ValueError("Can't find enought primes at prime_size=%d" % + prime_size) + return sorted(primes) + + +def expectedFactorsBySigma(args, primes, param, sigma_0, B1): + '''Calculate which primes will be found by which curves''' + factor_by_sigma = {} + for i, prime in enumerate(primes): + # This can be slower than the actual ECM! 
+ sigmas = smallGroupOrders(prime, param, sigma_0, B1, args.gpucurves) + if args.verbose > 2: + print('\t%2d: %20d found @ B1=%d by %s' % (i, prime, B1, sigmas)) + for sigma in sigmas: + if sigma not in factor_by_sigma: + factor_by_sigma[sigma] = 1 + factor_by_sigma[sigma] *= prime + return factor_by_sigma + + +def verifyFactorsFoundBySigma( + args, primes, param, sigma_0, B1, factor_by_sigma): + '''Verify the expected factors where found by ECM''' + + if not any(found for found in factor_by_sigma.values() if found > 1): + raise ValueError( + 'No primes would be found in stage 1, ' + 'lower prime_size or increase B1(%d)' % B1) + + N_log2 = log(prod(primes), 2).n() + assert N_log2 < args.nbits, (args.nbits, N_log2) + + N_str = '*'.join(map(str, primes)) + if args.verbose > 1: + sigma_str = ', '.join(map(str, sorted(factor_by_sigma))) + print('\tSigmas with factors: %s' % sigma_str) + + echo_cmd = 'echo %s | ' % N_str + ecm_cmd = '%s -gpu -gpucurves %d -sigma %d:%d %d 0' % ( + args.ecm_cmd, args.gpucurves, param, sigma_0, B1) + + + if args.verbose > 2: + print('\t' + echo_cmd + ecm_cmd) + elif args.verbose: + print('\t' + ecm_cmd) + + try: + output = subprocess.check_output( + echo_cmd + ecm_cmd, shell=True, universal_newlines=True) + assert False, 'Should have factors and had non-zero return' + except subprocess.CalledProcessError as e: + assert e.returncode in (2, 6, 8, 10, 14), e.returncode + lines = e.output.split('\n') + + found_factors = {} + for line in lines: + match = FACTOR_FOUND_RE.search(line) + if match: + f, sigma = map(int, match.groups()) + assert sigma not in found_factors + found_factors[sigma] = f + + all_sigmas = set(factor_by_sigma.keys()) | set(found_factors.keys()) + for sigma in sorted(all_sigmas): + theory = factor_by_sigma.get(sigma, 1) + practice = found_factors.get(sigma, 1) + if theory != practice: + if theory % practice == 0: + print('sigma=%d Expected to find %d, found %d' % + (sigma, theory, practice)) + elif practice % theory == 0: + 
extra = practice / theory + f = factor(GroupOrder(param, extra, sigma)) + print('\tExtra factor (%d) found by sigma=%d ' + 'expected order=%s' % (extra, sigma, f)) + else: + print('MAJOR MISMATCH: %d vs %d' % ( + factor(theory), factor(practice))) + + expected_curves = len(factor_by_sigma) + perfect_match = factor_by_sigma == found_factors + if perfect_match: + if args.verbose: + print('Results matched exactly (%d curves found factors)' % + expected_curves) + else: + print('Wrong results for seed=%d' % seed) + print('\t' + echo_cmd + ecm_cmd) + sys.exit(1) + + if args.verbose: + print('') + + return len(found_factors) + + +def stage1Tests(args, prime_size, B1, param, seed): + ''' + Generate N such that many sigmas (sigma_0:sigma_0+args.gpucurves) have factors + Verify sigmas found factor in stage 1. + ''' + assert param == GPU_PARAM, ('GPU only supports param=%d' % GPU_PARAM) + assert prime_size < args.nbits + assert args.nbits <= 1020 + assert prime_size > 20 + + prime_count = args.nbits // prime_size + if args.verbose: + print('Testing GPU stage1: N = %d x %d bits primes @ B1=%d ' % ( + prime_count, prime_size, B1)) + + if args.verbose > 1: + print('\tusing seed: %s' % seed) + + random.seed(seed) + sigma_0 = random.randrange(1000, 2^31) + + primes = findPrimesOfSize(prime_count, prime_size) + factor_by_sigma = expectedFactorsBySigma(args, primes, param, sigma_0, B1) + + return verifyFactorsFoundBySigma( + args, primes, param, sigma_0, B1, factor_by_sigma) + + +def overflowTest(args, param, seed): + ''' + Generate N such that N is VERY close to nbits + Verify small factors found + ''' + + random.seed(seed) + sigma_0 = random.randrange(1000, 2^31) + + # Multiply a handful of small primes that "should" be found by each sigma + # Then pad out N with a giant prime + primes = findPrimesOfSize(count=256//12, prime_size=12) + expected = prod(primes) + + pad_limit = 2 ** args.nbits// expected + large_prime = Primes().next(ZZ(random.randint(pad_limit // 2, pad_limit))) + + 
N_log2 = log(expected * large_prime, 2).n() + # within 1 bits of the limit + assert 0 < args.nbits - N_log2 < 1, (args.nbits, N_log2) + if args.verbose: + print ("Checking overflow with log2(N) = %.2f" % N_log2) + + # Expect all the small primes to be found by all sigmas + factor_by_sigma = {} + for sigma in range(sigma_0, sigma_0 + args.gpucurves): + factor_by_sigma[sigma] = expected + + return verifyFactorsFoundBySigma( + args, primes + [large_prime], param, sigma_0, args.B1, factor_by_sigma) + + +def timingTest(args, N_sizes): + '''Produce some timing information on ecm''' + primes = [Primes().next(ZZ(int(2 ** (n - 0.5)))) for n in N_sizes] + + # Doesn't matter + sigma = "%d:%d" % (GPU_PARAM, 12) + + for size, prime in zip(N_sizes, primes): + cmd = 'echo %s | %s -gpu -gpucurves %d -sigma %s %d 0' % ( + prime, args.ecm_cmd, args.gpucurves, sigma, args.B1) + try: + output = subprocess.check_output(cmd, shell=True, universal_newlines=True) + timing = min(line for line in output.split('\n') if line.startswith("Computing ")) + timing = timing[timing.index("took") + 4:].strip() + except subprocess.CalledProcessError as e: + timing = "error" + debug = 'N=%d bits, B1=%d, curves=%d, ' % (size, args.B1, args.gpucurves) + print (debug, timing) + + +if __name__ == '__main__': + args = parser.parse_args() + + testInternal() + + seed = args.seed + if seed is None: + seed = random.randrange(2 ^ 32) + + if args.timing: + timingTest(args, [250, 500, 1000, 1500, 2000]) + exit() + + # GPU needs 6 bits for carry / temp results + args.nbits -= 6 + overflowTest(args, GPU_PARAM, seed) + + if args.iterations: + found = 0 + for i in range(args.iterations): + # Test smallish primes (40 bits = 12 digit) at B1 (default: 10^4) + found += stage1Tests(args, 40, args.B1, GPU_PARAM, seed) + seed += 1 + + # Test larger primes at 10xB1 + found += stage1Tests(args, 60, 10*args.B1, GPU_PARAM, seed) + seed += 1 + + print('Results matched in %d tests (%d curves found factors)' % + (2*args.iterations, 
found)) diff -Nru gmp-ecm-7.0.4+ds/check.mpl gmp-ecm-7.0.5+ds/check.mpl --- gmp-ecm-7.0.4+ds/check.mpl 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/check.mpl 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,657 @@ +pm1_stage1 := proc(n, a0, B1) +local p, a, q; + p := 2; + a := a0; + while p <= B1 do + q := 1; + while q*p <= B1 do + q := q*p; + a := Power(a, p) mod n; + od; + p := nextprime(p); + od; + a +end: + +Powering["P-1"] := proc(a,i,p) Power(a,i) mod p end: +Select["P-1"] := proc(i) evalb(i mod 6=1) end: +Select["P+1"] := proc(i) member(i mod 6,{1,5}) end: + +# stage2, method="P-1" or "P+1" +# p is the modulus +stage2 := proc(p, d, a0, k, method) +local x, F, inva, i, v, t, u, a, dF, G, j, H, ij; + a := a0; + F := 1; + dF := numtheory[phi](d) / 2; + if method = "P-1" then inva := 1/a mod p fi; + for i from 1 to d by 2 do + if Select[method](i) and igcd(i,d)=1 then + v := Powering[method](a,i,p); + if method="P-1" then v := v + Powering[method](inva,i,p) fi; + F := F * (x + v) + fi + od; + F := Expand(F) mod p; + + a := Powering[method](a, d, p); + t := 1; + if method="P-1" then + inva := Powering[method](inva, d, p); + u := 1; + fi; + + lprint("B2=", k*(dF-1)*d); + + ij := 0; + for i to k do + G := 1; + for j from 1 to dF - 1 do + ij := ij + 1; + t := Powering[method](a, ij, p); + v := t; + if method="P-1" then + u := Powering[method](inva, ij, p); + v := v + u; + fi; + G := G * (x + v); + od; + G := Expand(G) mod p; + if i = 1 then H := G + else + G := Expand(G * H) mod p; + H := Rem(G, F, x) mod p; + fi; + od; + G := Gcd(F, H) mod p; + if degree(G)<>0 then lprint("****** Found factor in stage 2: ", p) fi; +end: + +list_mul_opt := proc(n) option remember; + if n<=1 then n + elif member(n, {2,5,6,7,8,17,18,23,24,29,30}) then # Karatsuba + 2*procname(ceil(n/2))+procname(floor(n/2)) + elif member(n, {3,9,10,11,12,20,21,25,26,27}) then # toomcook3 + 4*procname(ceil(n/3))+procname(n-2*ceil(n/3)) + else # toomcook4 + 
6*procname(ceil(n/4))+procname(n-3*ceil(n/4)) + fi +end: + +# number of scalar multiplies from karatsuba +karatsuba := proc(n) option remember; + if n<=1 then n + else 2*procname(ceil(n/2))+procname(floor(n/2)) + fi +end: + +# number of scalar multiplies from karatsuba, short product +karatsuba_short := proc(n) option remember; + if n<=1 then n + else procname(ceil(n/2))+2*procname(floor(n/2)) + fi +end: + +# number of scalar multiplies from Karatsuba +# Mulders' short product (optimal cutoff) +karatsuba_short_mulders := proc(n) option remember; local p, m; + if n<=1 then n + else + m := infinity; + for p from ceil(n/2) to n-1 do + m := min(m, karatsuba(p)+2*procname(n-p)) + od; + m + fi +end: + +# get m terms, with entries of n terms +karatsuba_short2 := proc(n0, m0) option remember; local n, m; + n := n0; + m := m0; + if m>2*n-1 then m:=2*n-1 + elif m2*n-1 then m:=2*n-1 + elif mn+1, m->m+2 + c2 := toomcook3_short2(ceil((n+1)/3)-1, ceil((m-4)/3)) # eval. at 0 + + 3*toomcook3_short2(ceil((n+1)/3), ceil((m+1)/3)) # eval. 
at t=1, -1, 2 + + toomcook3_short2(ceil((n-1)/3), ceil((m+1)/3)); # t = inf + c1 := min(c1, c2); + if karatsuba_short2(n,m) < c1 then + lprint("karatsuba_short2 faster for ", n, m) + fi; + c1 + fi +end: + +toomcook3_2 := proc(n) option remember; local l0, l1, l2; + if n<=2 or n=4 then karatsuba(n) + else + l2 := iquo(n + 2, 3); + l1 := iquo(n + 1, 3); + l0 := n - l2 - l1; + 3*procname(l2) + procname(l0) + procname(l1) + fi +end: + +# memory used by toomcook3 +M := proc(len) local l; +option remember; + l := iquo(len + 2, 3); + 4 * l + max (M(l), 1) +end: +M(0):=0: +M(1):=0: +M(2):=1: +M(4):=5: + +# number of scalar multiplies from toomcook4 +toomcook4 := proc(n) option remember; local l, k; + if member(n,{0,1,2,3,5,6,9,17,18,25,26,27,77,78,79,80,81}) then toomcook3(n) + else + l := iquo(n + 3, 4); + k := n - 3 * l; + 6*procname(l) + procname(k) + fi +end: + +# find optimal method between kara, toom3 and toom4 +find_opt := proc(nmax) local n, T, kara, toom3, toom4; + T[0]:=0; + T[1]:=1; + for n from 2 to nmax do + kara := 2*T[ceil(n/2)]+T[floor(n/2)]; + toom3 := 4*T[ceil(n/3)]+T[n-2*ceil(n/3)]; + if n>=4 and n<>5 then + toom4 := 6*T[ceil(n/4)]+T[n-3*ceil(n/4)] + else + toom4 := kara; + fi; + if kara<=min(toom3,toom4) then lprint(n, "karatsuba", kara); T[n]:=kara + elif toom3<=toom4 then lprint(n, "toomcook3", toom3); T[n]:=toom3 + else T[n]:=toom4 + fi + od; +end: + +# number of scalar multiplies for Toom3 +# Mulders' short product (optimal cutoff) +toomcook4_short_mulders := proc(n) option remember; local p, m, c, s; + if n<=1 then n + else + m := infinity; + for p from ceil(n/2) to n do + c := toomcook4(p)+2*procname(n-p); + if c=l +list_mul := proc(k, l) + if k=l then toomcook4(l) + elif k=l+1 then toomcook4(l) + l # special important case + else toomcook4(l) + list_mul(max(k-l, l), min(k-l, l)) + fi +end: + +# number of multiplies of PolyFromRoots +PolyFromRoots := proc(k) option remember; local l, m; + if k<=1 then 0 + elif k=2 then 1 + else + m := iquo(k, 2); 
+ l := k - m; # l = k or l = k + 1 + procname(l) + procname (m) + list_mul(l, m) + fi +end: + +# PolyFromRoots with optimal cutoff point +# (depends on the multiplication used, and the way list_mul deals with +# operands of different degree) +PolyFromRoots_opt := proc(k) option remember; local l, m, s, c, cmin; + if k<=1 then 0 + elif k=2 then 1 + else + cmin := infinity; + for m from 1 to iquo(k,2) do + l := k - m; + c := procname(l) + procname (m) + list_mul(l, m); + if c 1 then v := v + list_mul (l-1, k-1) fi; + v + fi +end: + +PolyEval := proc(k) option remember; local m, l, v; + if k=1 then 0 + else + m := iquo(k, 2); + l := k - m; + v := RecursiveDivision(m); + if k > 2*m then v := v + m fi; + v + RecursiveDivision(l) + procname(l) + procname(m) + fi +end: + +# output list with increasing phi(d) and decreasing step2_cost(d) +gen_bestD := proc(d0, d1) local l, d, c, p, i, j; + l := [[d0,numtheory[phi](d0),step2_cost(d0)]]; + for d from d0+6 by 6 to d1 do + p := numtheory[phi](d); + c := step2_cost(d); + for i to nops(l) while p > l[i][2] and c < l[i][3] do od; + # now i > nops(l) or phi(d) <= phi(l[i]) or c >= step2_cost(l[i]) + if i > nops(l) then l:=[op(l),[d,p,c]] + elif p <= l[i][2] then # p <= l[j][2] for j >= i + for j from i to nops(l) while c <= l[j][3] do od; + l:=[op(1..i-1, l), [d,p,c], op(j..nops(l), l)] + else # p > l[i][2] and c >= step2_cost(l[i]) + fi + od; + l +end: + +# estimate number of multiplies of PolyGcd(F, G) +# with deg(F)=n and deg(G)=n-1 +PolyGcd := proc(n) + if n<=1 then 0 # gcd is G + else HalfGcd(n,0) + PolyGcd(ceil(n/2)) + fi +end: + +# deg(F)=n and deg(G)=n-1, assumes return cofactors +# and matrix if flag=1 +HalfGcd := proc(n,flag) option remember; local k, l, c; + if n<=1 then 0 + else + k := ceil(n/2); + l := ceil(n/4); + c := procname(k, 1) # return 2x2 matrix with degrees n/4 + + 8*toomcook4(l) # 4 multiplies of n/2 * n/4 + + procname(k, flag) # 2nd call + + 4*toomcook4(l); # 4 multiplies of n/4 * n/4 + if flag=1 then 
c:=c+7*toomcook4(l) fi; # multiply matrices + c + fi +end: + +# auxiliary memory needed for karatsuba +M := proc(K) option remember; local l; + if K=1 then 0 + else + l := iquo(K+1, 2); + max(2*l-1+l,2*l-2+M(l)) + fi; +end: + +# cf Williams, Math. of Comp. 39, 1982 +# pp1_stage1(328006342451, 5, 7043); # 2^235+1 +# pp1_stage1(6215074747201, 5, 199729); # 2^297+1 +# pp1_stage1(8857714771093, 3, 49207); # 2^418+1 +# pp1_stage1(236344687097, 3, 55001); # 2^602+1 +# pp1_stage1(87251820842149, 5, 170249); # 2^642+1 +# pp1_stage1(719571227339189, 4, 57679); # 3^134+1 +# pp1_stage1(5468575720021, 6, 175759); # 3^161+1 +# pp1_stage1(49804972211, 5, 268757); # 3^185-1 +# pp1_stage1(329573417220613, 3, 101573); # 5^94+1 +# pp1_stage1(4866979762781, 4, 97609); # 6^59-1 +# pp1_stage1(187333846633, 3, 9851); # 6^111-1 +# pp1_stage1(332526664667473, 3, 111919); # 6^132+1 +# pp1_stage1(265043186297, 3, 152791); # 7^133+1 +# pp1_stage1(207734163253, 3, 4211); # 7^231+1 +# pp1_stage1(225974065503889, 5, 8243); # 10^102+1 +# pp1_stage1(660198074631409, 5, 115679); # 12^81-1 +# pp1_stage1(563215815517, 3, 109849); # 12^183+1 +# pp1_stage1(409100738617, 3, 70957); # fib(247) +# pp1_stage1(7901346123803597, 3, 18307); # fib(313) +# pp1_stage1(5648966761, 15, 100519); # fib(367) +# pp1_stage1(14279673833, 3, 823); # fib(387) +# pp1_stage1(1795220677069, 6, 159931); # fib(483) +# pp1_stage1(1250839826281, 5, 4673); # fib(495) +# pp1_stage1(2192843129417, 3, 38803); # fib(531) +# pp1_stage1(10424083697, 3, 131); # fib(549) +pp1_stage1 := proc(n, A0, B1) local a0, p, a, q; + # suggested default choice from Montgomery + if A0=0 then a0:=2/7 mod n else a0:=A0 fi; + if igcd(a0^2-4, n)<>1 then ERROR("igcd(a0^2-4, n)<>1") fi; + if isprime(n) and numtheory[jacobi](a0^2-4, n)=1 then + lprint("Warning: jacobi(a0^2-4, n) = 1") + fi; + p := 2; + a := a0; + while p <= B1 do + q := p; + while q*p <= B1 do q:=q*p od; + a := Powering["P+1"](a, q, n); + p := nextprime(p); + od; + p:=igcd(a-2, n); + if 
p<>1 then lprint("****** Found factor in stage 1: ", p) fi; + a +end: + +# compute P0 "power" p mod n for P+1 +Powering["P+1"] := proc(P0, p, n) local l, i, P, Q, R; + l := convert(p-1, base, 2); + P := P0; + Q := 2; + for i from nops(l) by -1 to 1 do + if l[i] = 1 then # (i,i-1) to (2i,2i-1) + Q := P*Q-P0 mod n; + P := P^2-2 mod n; + else # (i,i-1) to (2i-1,2i-2) + P := P*Q-P0 mod n; + Q := Q^2-2 mod n; + fi + od; + P +end: + +############################ ecm ################################### + +# converts (x:1:1) to Weierstrass form (mod p) +# returns [X,Y,A] +montgomery_to_weierstrass := proc(x, a, p) local g; + g := (x^3 + a*x^2 + x) mod p; + [(3*x+a)/(3*g) mod p, 1/g mod p, (3-a^2)/(3*g^2) mod p] +end: + +# return 2*(x1:z1) +dup := proc(x1, z1, b, n) local u, v, w, x2, z2; + u := x1+z1 mod n; + u := u^2 mod n; + v := x1-z1 mod n; + v := v^2 mod n; + x2 := u*v mod n; + w := u-v mod n; + u := w*b mod n; + u := u+v mod n; + z2 := w*u mod n; + return x2, z2; +end: + +# return (x1:z1) + (x2:z2), where (x:z) = (x1:z1) - (x2:z2) +add3 := proc(x1, z1, x2, z2, x, z, n) local u, v, w, x3, z3; + u := x2-z2 mod n; + v := x1+z1 mod n; + u := u*v mod n; + w := x2+z2 mod n; + v := x1-z1 mod n; + v := w*v mod n; + w := u+v mod n; + v := u-v mod n; + w := w*w mod n; + v := v*v mod n; + x3 := w*z mod n; + z3 := x*v mod n; + return x3, z3; +end: + +addW := proc(x1, y1, x2, y2, n) local u, v, p; + u := x2-x1; + v := 1/u mod n; # 1/(x2-x1) + p := (y2-y1)*v mod n; # lambda=(y2-y1)/(x2-x1) + u := p*p-x1; # lambda^2-x1 + v := u-x2 mod n; # lambda^2-x1-x2 + u := (x1-v)*p; # (2x1+x2-lambda^2)*lambda + [v, u-y1 mod n] +end: + +# (x::y) -> 2*(x::y) +dupW := proc(x, y, n, a) local u, v, p; + v := 1/(2*y) mod n; + u := 3*x^2+a mod n; + p := u*v mod n; + u := p^2; + v := 2*x; + u := u-v mod n; + [u, (x-u)*p-y mod n] +end: + +# (x::y) -> k*(x::y) +mulW := proc(x, y, k, n, a) local l, P, i; + if k=1 then [x,y] + else # k >= 3 + l := convert(k, base, 2); + P := [x, y]; + for i from nops(l)-1 
by -1 to 1 do + P := dupW(op(P), n, a); + if l[i]=1 then P := addW(op(P), x, y, n) fi + od; + P + fi +end: + +############################################################################## + +# odd-even variant +kara_short_mul := proc(a, b, n) +local a0, a1, b0, b1, c0, c1, c2, p, q, r, i, res; + if n = 0 then [] + elif n = 1 then [a[1]*b[1]] + else + p := ceil(n/2); + q := ceil((n-1)/2); + r := q; + a0 := [seq(a[2*i-1],i=1..p)]; + b0 := [seq(b[2*i-1],i=1..p)]; + a1 := [seq(a[2*i], i=1..q)]; + b1 := [seq(b[2*i], i=1..q)]; + c0 := procname(a0, b0, p); + c1 := procname(a0[1..q]+a1, b0[1..q]+b1, q); + c2 := procname(a1, b1, r); + c1 := c1 - c0[1..q] - [op(c2),0$(q-r)]; + res := [0$n]; + for i to p do res[2*i-1]:=c0[i] od; + for i to min(r,iquo(n-1,2)) do res[2*i+1]:=res[2*i+1]+c2[i] od; + for i to q do res[2*i]:=c1[i] od; + res + fi +end: + +# odd-even variant +toom3_short_mul := proc(a, b, n) +local a0, a1, a2, b0, b1, b2, c0, c1, c2, c3, c4, p, q, r, i, res; + if n = 0 then [] + elif n = 1 then [a[1]*b[1]] + elif n = 2 then [a[1]*b[1], a[1]*b[2]+a[2]*b[1]] + else + p := ceil(n/3); + q := ceil((n-1)/3); + r := ceil((n-2)/3); + a0 := [seq(a[3*i-2],i=1..p)]; + b0 := [seq(b[3*i-2],i=1..p)]; + a1 := [seq(a[3*i-1], i=1..q)]; + b1 := [seq(b[3*i-1], i=1..q)]; + a2 := [seq(a[3*i], i=1..r), 0$(q-r)]; + b2 := [seq(b[3*i], i=1..r), 0$(q-r)]; + c0 := procname(a0, b0, p); # 0 + c1 := procname(a0[1..q]+a1+a2, b0[1..q]+b1+b2, q); # 1 + c2 := procname(a0[1..q]-a1+a2, b0[1..q]-b1+b2, q); # -1 + c3 := procname(a0[1..q]+2*a1+4*a2, b0[1..q]+2*b1+4*b2, q); # 2 + c4 := procname(a2, b2, q); + c1 := c1 - c0[1..q] - c4; # d1+d2+d3 + c2 := c2 - c0[1..q] - c4; # -d1+d2-d3 + c3 := c3 - c0[1..q] - 16*c4; # 2*d1+4*d2+8*d3 + c1 := (c1 + c2)/2; # d2 + c2 := c2 - c1; # -d1-d3 + c3 := c3 - 4*c1; # 2*d1+8*d3 + c3 := c3 + 2*c2; # 6*d3 + c3 := c3/6; # d3 + c2 := -c2-c3; # d1 + res := [0$n]; + for i to p do res[3*i-2]:=c0[i] od; + for i to q do res[3*i-1]:=res[3*i-1]+c2[i] od; + for i to ceil((n-2)/3) do 
res[3*i]:=res[3*i]+c1[i] od; + for i to ceil((n-3)/3) do res[3*i+1]:=res[3*i+1]+c3[i] od; + for i to ceil((n-4)/3) do res[3*i+2]:=res[3*i+2]+c4[i] od; + res + fi +end: diff -Nru gmp-ecm-7.0.4+ds/checkprob gmp-ecm-7.0.5+ds/checkprob --- gmp-ecm-7.0.4+ds/checkprob 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/checkprob 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,21 @@ +#!/bin/csh +# Example: checkprob "./ecm -param 1" 31622776601683800097 11000 +set ecm="$1" +set p=$2 +set B1=$3 +set out=/tmp/log$$ +@ try = 0 +@ tot = 0 +while (1) + @ try = $try + 1 + echo $p | $ecm -one -c 0 $B1 > $out + grep Run $out > /dev/null + if ("$status" != "0") then # found in first curve + @ n = 1 + else + @ n =`grep Run $out | tail -1 | cut -d" " -f 2` + endif + @ tot = $tot + $n + @ avg = $tot / $try + echo $avg +end diff -Nru gmp-ecm-7.0.4+ds/check.sage gmp-ecm-7.0.5+ds/check.sage --- gmp-ecm-7.0.4+ds/check.sage 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/check.sage 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,114 @@ +def FindGroupOrder(p,s): + K = GF(p) + v = K(4*s) + u = K(s^2-5) + x = u^3 + b = 4*x*v + a = (v-u)^3*(3*u+v) + A = a/b-2 + x = x/v^3 + b = x^3 + A*x^2 + x + E = EllipticCurve(K,[0,b*A,0,b^2,0]) + return factor(E.cardinality()) + +def FindGroupOrderA(p,A): + K = GF(p) + d = K((A+2)/4) + a = K(4*d-2) + b = K(16*d+2) + E = EllipticCurve(K,[0,a/b,0,1/b^2,0]) + return factor(E.cardinality()) + +# for parameter sigma = 1:s +def FindGroupOrderParam1(p,s): + return FindGroupOrderA (p, 4*s^2/2^64-2) + +# for parameter sigma = 2:s +def FindGroupOrderParam2(p,s): + K = GF(p) + E = EllipticCurve(K,[0,36]) + P = s*E(-3,3) + x,y = P.xy() + x3 = (3*x+y+6)/(2*(y-3)) + A = -(3*x3^4+6*x3^2-1)/(4*x3^3) + return FindGroupOrderA(p, A) + +# for parameter sigma = 3:s +def FindGroupOrderParam3(p,s): + return FindGroupOrderA (p, 4*s/2^32-2) + +def FindGroupOrderParam (p, sigma, param): + if param == 0: + return FindGroupOrder (p, sigma) + elif param == 1: + return 
FindGroupOrderParam1 (p, sigma) + elif param == 2: + return FindGroupOrderParam2 (p, sigma) + elif param == 3: + return FindGroupOrderParam3 (p, sigma) + else: + print ("Invalid parametrization: ", param) + raise ValueError + +# check if the prime p is found with B1,B2,param,sigma, or raises an error +# check_found_aux ("./ecm", 31622776601683800097, 11000, 1873422, 1, 800667805) +# check_found_aux ("./ecm", 31622776601683800097, 11000, 1873422, 1, 800667806) +def check_found_aux (ecm, p, B1, B2, param, sigma): + f = open("/tmp/inxyz", "w") + f.write(str(p) + "\n") + f.close() + f = open("/tmp/doitxyz", "w") + f.write(ecm + " -param " + str(param) + " -sigma " + str(sigma) + " " + str(B1) + " " + str(B2) + " < /tmp/inxyz > /tmp/outxyz\n") + f.close() + os.system("chmod +x /tmp/doitxyz") + os.system("/tmp/doitxyz") + f = open("/tmp/outxyz", "r") + l = f.readlines() + f.close() + n = len(l) + if l[n-1] != 'Found input number N\n': + print ("prime p=", p, "not found with B1=", B1, "B2=", B2, "param=", param, "sigma=", sigma) + raise ValueError + +def is_found(l, B1, B2): + n = len(l) + if l[n-1][0] > B2: + return False + for i in range(n-2,-1,-1): + if l[i][0]^l[i][1] > B1: + return False + return True + +# check if a prime p is found with bounds B1 and B2, +# for parameter 'param' and sigma in [sigma_min,sigma_max-1] +# check_found ("./ecm", 31622776601683800097, 11000, 1873422, 0, 1000) +# check_found ("./ecm", 31622776601683800097, 11000, 1873422, 1, 1000) +# check_found ("./ecm", 31622776601683800097, 11000, 1873422, 2, 1000) +# check_found ("./ecm", 31622776601683800097, 11000, 1873422, 3, 1000) +def check_found (ecm, p, B1, B2, param, sigma_max): + assert (is_prime (p)) + e2 = 0 + e3 = 0 + tries = 0 + found = 0 + for sigma in range(sigma_max): + try: + l = FindGroupOrderParam (p, sigma, param) + except ArithmeticError: + continue + tries += 1 + assert (l[0][0] == 2) + e2 += l[0][1] + if l[1][0] == 3: + e3 += l[1][1] + if is_found (l, B1, B2): + # check the 
factor is really found + check_found_aux (ecm, p, B1, B2, param, sigma) + found += 1 + print (tries, found, 1.0*e2/tries, 1.0*e3/tries, 2.0^(e2/tries)*3.0^(e3/tries)) + +# check all parametrizations 0, 1, 2, 3 +# check_found_all ("./ecm", 31622776601683800097, 11000, 1873422, 1000) +def check_found_all (ecm, p, B1, B2, sigma_max): + for param in range(4): + check_found (ecm, p, B1, B2, param, sigma_max) diff -Nru gmp-ecm-7.0.4+ds/configure.ac gmp-ecm-7.0.5+ds/configure.ac --- gmp-ecm-7.0.4+ds/configure.ac 2016-10-11 09:23:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/configure.ac 2022-06-06 14:16:49.000000000 +0000 @@ -1,7 +1,7 @@ -m4_define([ECM_VERSION_AC], [7.0.4]) +m4_define([ECM_VERSION_AC], [7.0.5]) AC_PREREQ([2.57]) -AC_INIT([ecm], ECM_VERSION_AC, [ecm-discuss@lists.gforge.inria.fr]) +AC_INIT([ecm], ECM_VERSION_AC, [ecm-discuss@inria.fr]) AC_CONFIG_MACRO_DIR([m4]) GMP_INIT([config.m4]) @@ -10,12 +10,23 @@ AC_CONFIG_HEADERS([config.h ecm.h]) AC_DEFINE([ECM_VERSION], ["ECM_VERSION_AC"], [The version of GMP-ECM]) +AS_CASE(ECM_VERSION_AC, + [*-dev], AC_DEFINE([IS_DEV_BUILD], [1], [Define to 1 in DEV builds])) -dnl Copied from MPFR 2.4.2: -unset GMP_CFLAGS GMP_CC user_redefine_cc +dnl Originally copied from MPFR 2.4.2: +unset GMP_CFLAGS GMP_CC user_redefine_cc user_redefine_cflags dnl Check if user request his CC and CFLAGS -if test -n "$CFLAGS" || test -n "$CC" ; then - user_redefine_cc=yes +dnl Check is disabled if cc is a gcc variant +dnl Because this happens before AC_PROG_CC $GCC isn't set +if test -n "$CFLAGS"; then + user_redefine_cflags=yes +fi +if test -n "$CC" ; then + case $CC in + gcc*) check_use_defined_cc_is_gcc=yes ;; + clang*) check_use_defined_cc_is_gcc=yes ;; + *) user_redefine_cc=yes ;; + esac fi dnl the following is required to compile auxi.c according to autoconf 2.61 @@ -98,7 +109,7 @@ dnl would have incorrect settings. dnl FIXME: Move this in aclocal ? 
-if test "x$user_redefine_cc" = x && test "x$enable_gmp_cflags" = xyes && test "x$cross_compiling" != xyes; then +if test "x$user_redefine_cc$user_redefine_cflags" = x && test "x$enable_gmp_cflags" = xyes && test "x$cross_compiling" != xyes; then if test "x$GMP_CC$GMP_CFLAGS" = x; then AC_MSG_CHECKING([for CC and CFLAGS in gmp.h]) GMP_CC=__GMP_CC @@ -142,6 +153,11 @@ dnl But these variables may be invalid, so we must check them first. dnl Note: we do not use AC_RUN_IFELSE, as it implies AC_PROG_CC. if test "x$GMP_CC$GMP_CFLAGS" != x; then + if test "x$CC" != x && test "x$CC" != "x$GMP_CC" ; then + AC_MSG_NOTICE([overriding GMP_CC=$GMP_CC with CC=$CC]) + GMP_CC=$CC + fi + AC_MSG_CHECKING([whether CC=$GMP_CC and CFLAGS=$GMP_CFLAGS works]) AC_LANG_CONFTEST([AC_LANG_SOURCE([int main (void) { return 0; }])]) if $GMP_CC $GMP_CFLAGS -o conftest conftest.$ac_ext 2> /dev/null ; then @@ -158,10 +174,14 @@ dnl Checks for programs. -AC_PROG_CC +AC_PROG_CC_C99 AM_PROG_AS AM_PROG_CC_C_O +dnl If CC was user defined but looked like gcc, check that AC agrees. +if test "x$check_use_defined_cc_is_gcc" != "x" && test "x$GCC" != "xyes"; then + AC_MSG_ERROR([User specified CC=$CC looked like gcc but wasn't]) +fi # Now that we have decided on CC and CFLAGS, init libtool # Don't make a shared library by default. 
Enable building a shared library @@ -272,7 +292,7 @@ GMP_ASM_TYPE case $host in - *-*-mingw32|*cygwin*) GMP_DEFINE([WINDOWS64_ABI], 1) + *-*-mingw*|*cygwin*) GMP_DEFINE([WINDOWS64_ABI], 1) AC_DEFINE([WINDOWS64_ABI], 1,[Define to 1 if x86_64 mulredc*() functions should be called with Windows ABI]);; *) ;; esac @@ -405,7 +425,7 @@ dnl If we use GCC and user has not specified his own CFLAGS, dnl add some warning flags, avoiding duplication -if test "x$GCC" = xyes && test "x$user_redefine_cc" != xyes; then +if test "x$GCC" = xyes && test "x$user_redefine_cflags" != xyes; then case $CFLAGS in "-pedantic "* | *" -pedantic "* | *" -pedantic") ;; *) CFLAGS="-pedantic $CFLAGS" @@ -588,9 +608,38 @@ AC_MSG_RESULT(DLL) AC_MSG_ERROR([gmp.h is a DLL: use --disable-static --enable-shared]) ]) fi + + AC_MSG_CHECKING([whether we can link without -lpsapi]) + # AC_SEARCH_LIBS won't work here, so do it by hand... + AC_DEFUN([ECM_PSAPI_PROGRAM], + [AC_LANG_PROGRAM([[ + #ifdef _WIN32 + #include + #include + int testfunc () + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); + return (int) (info.PeakWorkingSetSize >> 10); + } + #endif + ]])]) + AC_LINK_IFELSE([ECM_PSAPI_PROGRAM], + [AC_MSG_RESULT(yes)], + [AC_MSG_RESULT(no) + try_link_psapi=yes + ]) + if test x"$try_link_psapi" = "xyes"; then + AC_MSG_CHECKING([whether we can link with -lpsapi]) + LIBS="-lpsapi $LIBS" + AC_LINK_IFELSE([ECM_PSAPI_PROGRAM], + [AC_MSG_RESULT(yes)], + [AC_MSG_RESULT(no) + AC_MSG_ERROR([memusage.c using unresolved GetProcessMemoryInfo]) + ]) + fi ;; esac -CL_AS_NOEXECSTACK AC_SUBST([LIBECM_LDFLAGS]) @@ -651,7 +700,6 @@ AC_SUBST([XSLDIR]) AC_SUBST([ASMPATH]) -AC_SUBST([GSL_LD_FLAGS]) COV_FRAG=' # These options are used for coverage tests diff -Nru gmp-ecm-7.0.4+ds/coverage_gpu.sh gmp-ecm-7.0.5+ds/coverage_gpu.sh --- gmp-ecm-7.0.4+ds/coverage_gpu.sh 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/coverage_gpu.sh 2022-06-06 14:16:49.000000000 +0000 @@ 
-0,0 +1,26 @@ +#!/bin/bash +# script to perform a coverage test, with gpu support + +t=$1 # target directory for the html files +d=`mktemp -d /tmp/ecmXXX` +cd $d +git clone https://gitlab.inria.fr/zimmerma/ecm.git +cd ecm +autoreconf -i +# ./configure --enable-gpu=sm_30 --with-gmp=/users/caramel/logiciels/gmp-6.0.0/core2/ --with-cuda=/usr/local/cuda-5.0.old/ --with-cc-for-cuda=/users/caramel/logiciels/gcc-4.3.6/x86_64/bin/ +# ./configure --enable-gpu=sm_30 --with-cuda=/tmp/cuda +./configure --enable-gpu --disable-shared --enable-static +# make CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" +make +export LD_LIBRARY_PATH=/usr/lib/cuda/lib64:. +./test.gpuecm ./ecm +make longcheck VALGRIND= +make bench_mulredc +./bench_mulredc +make tune +./tune -v +# geninfo --no-checksum --ignore-errors gcov,source -q --output-filename ecm.info ./ --no-external +# rm -rf $t +# genhtml -o $t/ ecm.info +cd +rm -rf $d diff -Nru gmp-ecm-7.0.4+ds/coverage.sh gmp-ecm-7.0.5+ds/coverage.sh --- gmp-ecm-7.0.4+ds/coverage.sh 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/coverage.sh 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,48 @@ +#!/bin/bash +# script to perform a coverage test +# +# In case of problems, make sure gcov has the same version number as gcc. +# If not, you might want to add the flag +# --gcov-tool +# in geninfo. +# Also you may add CC= to the make command. 
+# +t=$1 # target directory for the html files +d=`mktemp -d /tmp/ecmXXX` +cd $d +git clone https://gitlab.inria.fr/zimmerma/ecm +cd ecm +autoreconf -i +./configure --disable-assert +echo "Testing PARAMS00" +make CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" PARAMS00=1 +./test.ecm ./ecm +geninfo --no-checksum --ignore-errors gcov,source -q --output-filename ecm00.info ./ --no-external +make clean +echo "Testing PARAMS11" +make CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" PARAMS11=1 +./test.ecm ./ecm +geninfo --no-checksum --ignore-errors gcov,source -q --output-filename ecm11.info ./ --no-external +make clean +echo "Testing PARAMS22" +make CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" PARAMS22=1 +./test.ecm ./ecm +geninfo --no-checksum --ignore-errors gcov,source -q --output-filename ecm22.info ./ --no-external +make clean +echo "Testing PARAMS33" +make CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" PARAMS33=1 +./test.ecm ./ecm +geninfo --no-checksum --ignore-errors gcov,source -q --output-filename ecm33.info ./ --no-external +make clean +make CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" +echo "Testing longcheck" +make longcheck VALGRIND= +echo "Testing bench_mulredc" +./bench_mulredc -v +echo "Testing tune" +./tune -v +geninfo --no-checksum --ignore-errors gcov,source -q --output-filename ecm.info ./ --no-external +rm -rf $t +genhtml -o $t/ ecm.info ecm00.info ecm11.info ecm22.info ecm33.info +cd +rm -rf $d diff -Nru gmp-ecm-7.0.4+ds/cudacommon.cu gmp-ecm-7.0.5+ds/cudacommon.cu --- gmp-ecm-7.0.4+ds/cudacommon.cu 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/cudacommon.cu 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,125 @@ +/* When compiling the CUDA code, we do not want to include all ecm-impl.h*/ +#define _DO_NOT_INCLUDE_ECM_IMPL_H + +#include "cudacommon.h" +#include "ecm-gpu.h" + +#include + + +#ifndef __CUDACC__ +#error "This file should only be compiled with nvcc" +#endif + +/* First call to a global function initialize the device */ 
+__global__ void Cuda_Init_Device () +{ +} + +extern "C" +int +get_device_prop(int device, cudaDeviceProp *deviceProp) +{ + cudaError_t err; + + if (device!=-1) + { + err = cudaSetDevice(device); + if (err != cudaSuccess) + { + fprintf (stderr, "GPU: Error: Could not use device %d\n", device); + fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); + return 0; + } + } + + err = cudaGetDevice (&device); + if (err != cudaSuccess) + { + fprintf (stderr, "GPU: Error: no active device.\n"); + fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); + return 0; + } + + err = cudaGetDeviceProperties (deviceProp, device); + if (err != cudaSuccess) + { + fprintf (stderr, "GPU: Error while getting device's properties.\n"); + fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); + return 0; + } + return 1; +} + +extern "C" +int +select_and_init_GPU (int device, unsigned int *number_of_curves, int verbose, int schedule) +{ + cudaDeviceProp deviceProp; + + if (device!=-1 && verbose) + fprintf (stdout, "GPU: device %d is required.\n", device); + + if (!get_device_prop(device, &deviceProp)) + return -1; + + if (verbose) + { + printf ("GPU: will use device %d: %s, compute capability %d.%d, %d MPs.\n" + "GPU: maxSharedPerBlock = %zu maxThreadsPerBlock = %d " + "maxRegsPerBlock = %d\n", device, deviceProp.name, + deviceProp.major, deviceProp.minor, + deviceProp.multiProcessorCount, deviceProp.sharedMemPerBlock, + deviceProp.maxThreadsPerBlock, deviceProp.regsPerBlock); + } + + + if (*number_of_curves == 0) /* if choose the number of curves */ + { + /* Limited by the maximum number of threads per MP */ + unsigned int blocks_per_multiprocessor = 2; + *number_of_curves = blocks_per_multiprocessor * deviceProp.multiProcessorCount + * ECM_GPU_CURVES_BY_BLOCK; + } + else if (*number_of_curves % ECM_GPU_CURVES_BY_BLOCK != 0) + { + /* number_of_curves should be a multiple of ECM_GPU_CURVES_BY_BLOCK */ + *number_of_curves = (*number_of_curves / 
ECM_GPU_CURVES_BY_BLOCK + 1) * + ECM_GPU_CURVES_BY_BLOCK; + if (verbose) + fprintf(stderr, "GPU: the requested number of curves has been " + "modified to %u\n", *number_of_curves); + } + + /* First call to a global function initialize the device */ + if (schedule == 1) + { + cuda_check (cudaSetDeviceFlags (cudaDeviceScheduleBlockingSync)); + } + else + { + cuda_check (cudaSetDeviceFlags (cudaDeviceScheduleYield)); + } + Cuda_Init_Device<<<1, 1>>> (); + cuda_check (cudaGetLastError()); + + return 0; +} + +void +kernel_info(const void* func, int verbose) +{ + if (verbose) + { + struct cudaFuncAttributes kernelAttr; + cudaError_t err = cudaFuncGetAttributes (&kernelAttr, func); + if (err == cudaSuccess) + printf ("GPU: Using device code targeted for architecture compile_%d\n" + "GPU: Ptx version is %d\nGPU: maxThreadsPerBlock = %d\n" + "GPU: numRegsPerThread = %d sharedMemPerBlock = %zu bytes\n", + kernelAttr.binaryVersion, kernelAttr.ptxVersion, + kernelAttr.maxThreadsPerBlock, kernelAttr.numRegs, + kernelAttr.sharedSizeBytes); + } +} + diff -Nru gmp-ecm-7.0.4+ds/cudacommon.h gmp-ecm-7.0.5+ds/cudacommon.h --- gmp-ecm-7.0.4+ds/cudacommon.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/cudacommon.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,41 @@ +#ifndef _CUDACOMMON_H +#define _CUDACOMMON_H 1 + +#include +#include +#include +#ifdef _MSC_VER +#include +#endif + +#ifdef __cplusplus +/* cpp + CUDA only code */ + +#define CUDA_CHECK(action) cuda_check(action, #action, __FILE__, __LINE__) + +inline void cuda_check(cudaError_t status, const char *action=NULL, const char *file=NULL, int32_t line=0) { + if (status != cudaSuccess) { + fprintf (stderr, "CUDA error (%d) occurred: %s\n", status, cudaGetErrorString(status)); + if (action!=NULL) + fprintf (stderr, "While running %s (file %s, line %d)\n", action, file, line); + exit(EXIT_FAILURE); + } +} + + +void kernel_info(const void* func, int verbose); +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +int 
get_device_prop(int device, struct cudaDeviceProp *deviceProp); +int select_and_init_GPU (int, unsigned int*, int, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _CUDACOMMON_H */ diff -Nru gmp-ecm-7.0.4+ds/cudakernel.cu gmp-ecm-7.0.5+ds/cudakernel.cu --- gmp-ecm-7.0.4+ds/cudakernel.cu 2016-03-07 11:03:23.000000000 +0000 +++ gmp-ecm-7.0.5+ds/cudakernel.cu 2022-06-06 14:16:49.000000000 +0000 @@ -3,6 +3,7 @@ #include "ecm-gpu.h" #include #include "cudakernel.h" +#include "cudacommon.h" #ifndef __CUDACC__ #error "This file should only be compiled with nvcc" @@ -14,146 +15,14 @@ __device__ biguint_t d_Mcst; -#define errCheck(err) cuda_errCheck (err, __FILE__, __LINE__) -#define cudaMalloc(d, size) errCheck (cudaMalloc (d, size)) -#define cudaMemcpyHtoD(d, h, size) errCheck (cudaMemcpy ((void *) d, \ +#define cudaMalloc(d, size) cuda_check (cudaMalloc (d, size)) +#define cudaMemcpyHtoD(d, h, size) cuda_check (cudaMemcpy ((void *) d, \ (void *) h, size, cudaMemcpyHostToDevice)) -#define cudaMemcpyDtoH(h, d, size) errCheck (cudaMemcpy ((void *) h, \ +#define cudaMemcpyDtoH(h, d, size) cuda_check (cudaMemcpy ((void *) h, \ (void *) d, size, cudaMemcpyDeviceToHost)) -#define cudaMemcpyCst(d, h, size) errCheck (cudaMemcpyToSymbol (d, h, size)) +#define cudaMemcpyCst(d, h, size) cuda_check (cudaMemcpyToSymbol (d, h, size)) -/******************************/ -/* Host code handling the GPU */ -/******************************/ - -inline void cuda_errCheck (cudaError err, const char *file, const int line) -{ - if( err != cudaSuccess ) - { - fprintf(stderr, "%s(%i) : Error cuda : %s.\n", - file, line, cudaGetErrorString( err) ); - exit(EXIT_FAILURE); - } -} - -/* First call to a global function initialize the device */ -__global__ void Cuda_Init_Device () -{ -} - -/* Given the compute compatibility (as major.minor), return the number of block - * to be run on one multiprocessor. 
*/ -extern "C" -unsigned int -getNumberOfBlockPerMultiProcessor (int major, int minor) -{ - /* For 2.0 and 2.1, limited by the maximum number of threads per MP and the - * number of available registrer (need 23 registers per threads). - */ - if (major == 2) - return 1; - /* For 3.0, 3.2, 3.5 and 3.7 limited by the maximum number of threads per MP. - */ - else if (major == 3) - return 2; - /* For 5.0, 5.2, and 5.3 limited by the maximum number of threads per MP. */ - else if (major == 5) - return 2; - /* We assume that for newer compute capability the properties of the GPU won't - * decrease. - */ - else - return 2; -} - -extern "C" -int -select_and_init_GPU (int device, unsigned int *number_of_curves, int verbose) -{ - cudaDeviceProp deviceProp; - cudaError_t err; - - if (device!=-1) - { - if (verbose) - fprintf (stdout, "GPU: device %d is required.\n", device); - - err = cudaSetDevice(device); - if (err != cudaSuccess) - { - fprintf (stderr, "GPU: Error: Could not use device %d\n", device); - fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); - return -1; - } - } - - err = cudaGetDevice (&device); - if (err != cudaSuccess) - { - fprintf (stderr, "GPU: Error: no active device.\n"); - fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); - return -1; - } - - err = cudaGetDeviceProperties (&deviceProp, device); - if (err != cudaSuccess) - { - fprintf (stderr, "GPU: Error while getting device's properties.\n"); - fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); - return -1; - } - - if (verbose) - { - printf ("GPU: will use device %d: %s, compute capability %d.%d, %d MPs.\n" - "GPU: maxSharedPerBlock = %zu maxThreadsPerBlock = %d " - "maxRegsPerBlock = %d\n", device, deviceProp.name, - deviceProp.major, deviceProp.minor, - deviceProp.multiProcessorCount, deviceProp.sharedMemPerBlock, - deviceProp.maxThreadsPerBlock, deviceProp.regsPerBlock); - } - - - if (*number_of_curves == 0) /* if choose the number of curves */ - { - 
unsigned int n, m = ECM_GPU_CURVES_BY_BLOCK; - n = getNumberOfBlockPerMultiProcessor (deviceProp.major, deviceProp.minor); - *number_of_curves = n * deviceProp.multiProcessorCount * m; - } - else if (*number_of_curves % ECM_GPU_CURVES_BY_BLOCK != 0) - { - /* number_of_curves should be a multiple of ECM_GPU_CURVES_BY_BLOCK */ - *number_of_curves = (*number_of_curves / ECM_GPU_CURVES_BY_BLOCK + 1) * - ECM_GPU_CURVES_BY_BLOCK; - if (verbose) - fprintf(stderr, "GPU: the requested number of curves has been " - "modified to %u\n", *number_of_curves); - } - - /* First call to a global function initialize the device */ - errCheck (cudaSetDeviceFlags (cudaDeviceScheduleYield)); - Cuda_Init_Device<<<1, 1>>> (); - errCheck (cudaGetLastError()); - - if (verbose) - { - struct cudaFuncAttributes kernelAttr; - err = cudaFuncGetAttributes (&kernelAttr, Cuda_Ell_DblAdd); - if (err == cudaSuccess) - { - printf ("GPU: Using device code targeted for architecture compile_%d\n" - "GPU: Ptx version is %d\nGPU: maxThreadsPerBlock = %d\n" - "GPU: numRegsPerThread = %d sharedMemPerBlock = %zu bytes\n", - kernelAttr.binaryVersion, kernelAttr.ptxVersion, - kernelAttr.maxThreadsPerBlock, kernelAttr.numRegs, - kernelAttr.sharedSizeBytes); - } - } - - return 0; -} - extern "C" float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN, biguint_t *h_xarray, biguint_t *h_zarray, @@ -161,6 +30,10 @@ unsigned int firstinvd, unsigned int number_of_curves, int verbose) { + /* Print some debug info about the kernel */ + kernel_info((const void*) Cuda_Ell_DblAdd, verbose); + + cudaEvent_t start, stop; cudaEventCreate (&start); cudaEventCreate (&stop); @@ -191,7 +64,7 @@ /* Create a pair of events to pace ourselves */ for (i=0; i + +#ifdef __cplusplus __global__ void Cuda_Ell_DblAdd (biguint_t *xarg, biguint_t *zarg, biguint_t *x2arg, biguint_t *z2arg, unsigned int firstinvd); +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +int get_device_prop(int device, struct cudaDeviceProp 
*deviceProp); +int select_and_init_GPU (int, unsigned int*, int, int); +float cuda_Main (biguint_t, biguint_t, biguint_t, digit_t, biguint_t*, + biguint_t*, biguint_t*, biguint_t*, mpz_t, unsigned int, + unsigned int, int); +#ifdef __cplusplus +} +#endif + #endif /* _CUDAKERNEL_H */ diff -Nru gmp-ecm-7.0.4+ds/cudawrapper.c gmp-ecm-7.0.5+ds/cudawrapper.c --- gmp-ecm-7.0.4+ds/cudawrapper.c 2016-07-25 11:06:54.000000000 +0000 +++ gmp-ecm-7.0.5+ds/cudawrapper.c 2022-06-06 14:16:49.000000000 +0000 @@ -2,12 +2,15 @@ #ifdef WITH_GPU -#define TWO32 4294967296 /* 2^32 */ +#include "cudacommon.h" +#include "cudakernel.h" -extern int select_and_init_GPU (int, unsigned int*, int); -extern float cuda_Main (biguint_t, biguint_t, biguint_t, digit_t, biguint_t*, - biguint_t*, biguint_t*, biguint_t*, mpz_t, unsigned int, - unsigned int, int); +#ifdef HAVE_CGBN_H +#include "cgbn_stage1.h" +#endif /* HAVE_CGBN_H */ + + +#define TWO32 4294967296 /* 2^32 */ int findfactor (mpz_t factor, mpz_t N, mpz_t xfin, mpz_t zfin) { @@ -36,6 +39,138 @@ return youpi; } +/* Try to reduce all composite factors to primes. + * This can be hard if factors overlap e.g. 
(a*b, a*c*d, b*c) + */ +void reducefactors (mpz_t *factors, int *array_found, unsigned int nb_curves) +{ + unsigned int i, j; + unsigned int found; + unsigned int updates; + mpz_t gcd; + mpz_init (gcd); + + found = 0; + mpz_t *reduced = (mpz_t *) malloc (nb_curves * sizeof (mpz_t)); + ASSERT_ALWAYS (reduced != NULL); + + /* Add all unique factors to reduced */ + for (i = 0; i < nb_curves; i++) + { + if (array_found[i] == ECM_NO_FACTOR_FOUND) + continue; + + /* Scan for match */ + updates = 0; + for (j = 0; j < found; j++) { + if (mpz_cmp (factors[i], reduced[j]) == 0) { + updates = 1; + break; + } + } + if (!updates) + mpz_init_set (reduced[found++], factors[i]); + } + + do { + outputf (OUTPUT_DEVVERBOSE, "GPU: Reducing %d factors\n", found); + updates = 0; + + /* remove any trivial factor */ + for (i = 0; i < found; i++) + { + while (mpz_cmp_ui (reduced[i], 1) == 0) { + found--; + mpz_swap (reduced[i], reduced[found]); + mpz_clear (reduced[found]); + if (i == found) + break; + } + } + + for (i = 0; i < found; i++) + { + /* Try to reduce an existing factor */ + for (j = i+1; j < found; j++) + { + /* if i == j remove reduced[j] */ + if (mpz_cmp (reduced[i], reduced[j]) == 0) + { + updates += 1; + found--; + mpz_swap (reduced[j], reduced[found]); + mpz_clear (reduced[found]); + if (j == found) + break; + } + + mpz_gcd (gcd, reduced[i], reduced[j]); + if (mpz_cmp_ui (gcd, 1) > 0) + { + /* gcd(2*3, 2*3*5) remove 2*3 from F2 leaving 2*3 and 5 */ + if (mpz_cmp (gcd, reduced[i]) == 0) + { + updates += 1; + assert( mpz_divisible_p (reduced[j], gcd) ); + mpz_divexact (reduced[j], reduced[j], gcd); + } + /* gcd(2*3*5, 2*3) == 2*3 from F1 leaving 5 and 2*3 */ + else if (mpz_cmp (gcd, reduced[j]) == 0) + { + updates += 1; + assert( mpz_divisible_p (reduced[i], gcd) ); + mpz_divexact (reduced[i], reduced[i], gcd); + } + + /* hard case gcd(2*3, 3*5) = 3, remove 3 from both, add 3 as new factor */ + else if (found < nb_curves) + { + updates += 1; + mpz_divexact (reduced[j], 
reduced[j], gcd); + mpz_divexact (reduced[i], reduced[i], gcd); + + mpz_init (reduced[found]); + mpz_set (reduced[found], gcd); + found++; + } + } + if (mpz_cmp_ui (reduced[i], 1) == 0) + break; + } + } + } while (updates > 0); + + /* bubble_sort, fast enough because found < num_curves */ + do { + updates = 0; + for (j = 1; j < found; j++) + { + if (mpz_cmp(reduced[j-1], reduced[j]) > 0) + { + updates += 1; + mpz_swap(reduced[j-1], reduced[j]); + } + } + } while (updates > 0); + + outputf (OUTPUT_DEVVERBOSE, "GPU: Reduced to %d factors\n", found); + /* write out reduced[i], update array_found */ + for (i = 0; i < found; i++) + { + mpz_swap(factors[i], reduced[i]); + mpz_clear(reduced[i]); + array_found[i] = ECM_FACTOR_FOUND_STEP1; + outputf (OUTPUT_DEVVERBOSE, "GPU: Reduced factor %d: %Zd\n", i+1, factors[i]); + } + + for (i = found; i < nb_curves; i++) + array_found[i] = ECM_NO_FACTOR_FOUND; + + mpz_clear (gcd); + free(reduced); +} + + void to_mont_repr (mpz_t x, mpz_t n) { mpz_mul_2exp (x, x, ECM_GPU_MAX_BITS); @@ -78,7 +213,29 @@ } } -int gpu_ecm_stage1 (mpz_t *factors, int *array_stage_found, mpz_t N, mpz_t s, +static void +A_from_sigma (mpz_t A, unsigned int sigma, mpz_t n) +{ + mpz_t tmp; + int i; + mpz_init_set_ui (tmp, sigma); + /* Compute d = sigma/2^ECM_GPU_SIZE_DIGIT */ + for (i = 0; i < ECM_GPU_SIZE_DIGIT; i++) + { + if (mpz_tstbit (tmp, 0) == 1) + mpz_add (tmp, tmp, n); + mpz_div_2exp (tmp, tmp, 1); + } + mpz_mul_2exp (tmp, tmp, 2); /* 4d */ + mpz_sub_ui (tmp, tmp, 2); /* 4d-2 */ + + mpz_set (A, tmp); + + mpz_clear (tmp); +} + + +int gpu_ecm_stage1 (mpz_t *factors, int *array_found, mpz_t N, mpz_t s, unsigned int number_of_curves, unsigned int firstsigma, float *gputime, int verbose) { @@ -184,15 +341,15 @@ from_mont_repr (xp, N, invB); from_mont_repr (zp, N, invB); - array_stage_found[i] = findfactor (factors[i], N, xp, zp); + array_found[i] = findfactor (factors[i], N, xp, zp); - if (array_stage_found[i] != ECM_NO_FACTOR_FOUND) + if (array_found[i] 
!= ECM_NO_FACTOR_FOUND) { - youpi = array_stage_found[i]; + youpi = array_found[i]; outputf (OUTPUT_NORMAL, "GPU: factor %Zd found in Step 1 with" " curve %u (-sigma 3:%u)\n", factors[i], i, sigma); } - } + } mpz_clear (N3); mpz_clear (invN); @@ -213,35 +370,14 @@ return youpi; } -static void -A_from_sigma (mpz_t A, unsigned int sigma, mpz_t n) -{ - mpz_t tmp; - int i; - mpz_init_set_ui (tmp, sigma); - /* Compute d = sigma/2^ECM_GPU_SIZE_DIGIT */ - for (i = 0; i < ECM_GPU_SIZE_DIGIT; i++) - { - if (mpz_tstbit (tmp, 0) == 1) - mpz_add (tmp, tmp, n); - mpz_div_2exp (tmp, tmp, 1); - } - mpz_mul_2exp (tmp, tmp, 2); /* 4d */ - mpz_sub_ui (tmp, tmp, 2); /* 4d-2 */ - - mpz_set (A, tmp); - - mpz_clear (tmp); -} - int -gpu_ecm (mpz_t f, mpz_t x, int *param, mpz_t firstsigma, mpz_t n, mpz_t go, +gpu_ecm (mpz_t f, mpz_t x, int param, mpz_t firstsigma, mpz_t n, mpz_t go, double *B1done, double B1, mpz_t B2min_parm, mpz_t B2_parm, unsigned long k, const int S, int verbose, int repr, int nobase2step2, int use_ntt, int sigma_is_A, FILE *os, FILE* es, char *chkfilename ATTRIBUTE_UNUSED, char *TreeFilename, double maxmem, int (*stop_asap)(void), mpz_t batch_s, double *batch_last_B1_used, - int device, int *device_init, unsigned int *nb_curves) + int use_cgbn, int device, int *device_init, unsigned int *nb_curves) { unsigned int i; int youpi = ECM_NO_FACTOR_FOUND; @@ -252,8 +388,8 @@ float gputime = 0.0; mpz_t tmp_A; mpz_t *factors = NULL; /* Contains either a factor of n either end-of-stage-1 - residue (depending of the value of array_stage_found */ - int *array_stage_found = NULL; + residue (depending of the value of array_found */ + int *array_found = NULL; /* Only for stage 2 */ int base2 = 0; /* If n is of form 2^n[+-]1, set base to [+-]n */ int Fermat = 0; /* If base2 > 0 is a power of 2, set Fermat to base2 */ @@ -268,24 +404,26 @@ ASSERT((-1 <= sigma_is_A) && (sigma_is_A <= 1)); ASSERT((GMP_NUMB_BITS == 32) || (GMP_NUMB_BITS == 64)); + /* Set global VERBOSE to avoid the need 
to explicitly passing verbose */ set_verbose (verbose); ECM_STDOUT = (os == NULL) ? stdout : os; ECM_STDERR = (es == NULL) ? stdout : es; /* Check that N is not too big */ - if (mpz_sizeinbase (n, 2) > ECM_GPU_MAX_BITS-6) + size_t max_bits = (use_cgbn ? ECM_GPU_CGBN_MAX_BITS : ECM_GPU_MAX_BITS) - 6; + if (mpz_sizeinbase (n, 2) > max_bits) { outputf (OUTPUT_ERROR, "GPU: Error, input number should be stricly lower" - " than 2^%d\n", ECM_GPU_MAX_BITS-6); + " than 2^%d\n", max_bits); return ECM_ERROR; } /* Only param = ECM_PARAM_BATCH_32BITS_D is accepted on GPU */ - if (*param == ECM_PARAM_DEFAULT) - *param = ECM_PARAM_BATCH_32BITS_D; + if (param == ECM_PARAM_DEFAULT) + param = ECM_PARAM_BATCH_32BITS_D; - if (*param != ECM_PARAM_BATCH_32BITS_D) + if (param != ECM_PARAM_BATCH_32BITS_D) { outputf (OUTPUT_ERROR, "GPU: Error, only param = ECM_PARAM_BATCH_32BITS_D " "is accepted on GPU.\n"); @@ -351,14 +489,16 @@ TreeFilename, maxmem, Fermat, modulus); if (youpi == ECM_ERROR) goto end_gpu_ecm; - - /* Initialize the GPU if necessary */ + /* Set cudaDeviceScheduleBlockingSync with -cgbn, else cudaDeviceScheduleYield */ + int schedule = use_cgbn ? 1 : 0; + + /* Initialize the GPU if necessary and determine nb_curves */ if (!*device_init) { st = cputime (); youpi = select_and_init_GPU (device, nb_curves, - test_verbose (OUTPUT_VERBOSE)); + test_verbose (OUTPUT_VERBOSE), schedule); if (youpi != 0) { @@ -372,18 +512,18 @@ /* try running 'nvidia-smi -q -l' on the background . 
*/ *device_init = 1; } - + /* Init arrays */ factors = (mpz_t *) malloc (*nb_curves * sizeof (mpz_t)); ASSERT_ALWAYS (factors != NULL); - array_stage_found = (int *) malloc (*nb_curves * sizeof (int)); - ASSERT_ALWAYS (array_stage_found != NULL); + array_found = (int *) malloc (*nb_curves * sizeof (int)); + ASSERT_ALWAYS (array_found != NULL); for (i = 0; i < *nb_curves; i++) { mpz_init (factors[i]); - array_stage_found[i] = ECM_NO_FACTOR_FOUND; + array_found[i] = ECM_NO_FACTOR_FOUND; } @@ -417,7 +557,7 @@ print_B1_B2_poly (OUTPUT_NORMAL, ECM_ECM, B1, *B1done, B2min_parm, B2min, B2, S, firstsigma, sigma_is_A, ECM_EC_TYPE_MONTGOMERY, - go, *param, *nb_curves); + go, param, *nb_curves); outputf (OUTPUT_VERBOSE, "dF=%lu, k=%lu, d=%lu, d2=%lu, i0=%Zd\n", dF, k, root_params.d1, root_params.d2, root_params.i0); @@ -438,13 +578,24 @@ else { rhoinit (256, 10); - print_expcurves (B1, B2, dF, k, root_params.S, *param); + print_expcurves (B1, B2, dF, k, root_params.S, param); } } - + st = cputime (); - youpi = gpu_ecm_stage1 (factors, array_stage_found, n, batch_s, *nb_curves, - firstsigma_ui, &gputime, verbose); + + if (use_cgbn) { +#ifdef HAVE_CGBN_H + youpi = cgbn_ecm_stage1 (factors, array_found, n, batch_s, *nb_curves, + firstsigma_ui, &gputime, verbose); +#else + outputf (OUTPUT_ERROR, "cgbn not included"); + return ECM_ERROR; +#endif /* HAVE_CGBN_H */ + } else { + youpi = gpu_ecm_stage1 (factors, array_found, n, batch_s, *nb_curves, + firstsigma_ui, &gputime, verbose); + } outputf (OUTPUT_NORMAL, "Computing %u Step 1 took %ldms of CPU time / " "%.0fms of GPU time\n", *nb_curves, @@ -457,12 +608,15 @@ *B1done=B1; - /* Save stage 1 residues */ + /* GMP documentation says mpz_sizeinbase(op, 2) is always the exact value. */ + size_t n_bits = mpz_sizeinbase(n, 2); + + /* Save stage 1 residues as x = x0 + x1 * 2^bits + ... 
+ xk * 2^(bits*k) */ mpz_set_ui (x, 0); for (i = 0; i < *nb_curves; i++) { - mpz_mul (x, x, n); - mpz_add (x, x, factors[i]); + mpz_mul_2exp (x, x, n_bits); + mpz_add (x, x, factors[*nb_curves - 1 - i]); } /* was a factor found in stage 1 ? */ @@ -492,6 +646,10 @@ for (i = 0; i < *nb_curves; i++) { + /* hack to reduce verbose Step 2 */ + if (verbose > 0) + set_verbose (verbose-1); + if (test_verbose (OUTPUT_RESVERBOSE)) outputf (OUTPUT_RESVERBOSE, "x=%Zd\n", factors[i]); @@ -524,19 +682,16 @@ t); mpz_clear (t); } - - /* It is a hack to avoid very verbose Step 2 - (without it, stage2() prints a least a line by curves) */ - if (!test_verbose (OUTPUT_VERBOSE)) - set_verbose (0); + youpi = stage2 (factors[i], &P, modulus, dF, k, &root_params, use_ntt, - TreeFilename, stop_asap); + TreeFilename, i+1, stop_asap); + + next_curve: set_verbose (verbose); - next_curve: if (youpi != ECM_NO_FACTOR_FOUND) { - array_stage_found[i] = youpi; + array_found[i] = youpi; outputf (OUTPUT_NORMAL, "GPU: factor %Zd found in Step 2 with" " curve %u (-sigma 3:%u)\n", factors[i], i, i+firstsigma_ui); /* factor_found corresponds to the first factor found */ @@ -568,11 +723,14 @@ if (youpi == ECM_NO_FACTOR_FOUND && (stop_asap == NULL || !(*stop_asap)())) print_exptime (B1, B2, dF, k, root_params.S, - (long) (tottime / *nb_curves), *param); + (long) (tottime / *nb_curves), param); rhoinit (1, 0); /* Free memory of rhotable */ } } + + reducefactors(factors, array_found, *nb_curves); + /* If f0, ,fk are the factors found (in stage 1 or 2) * f = f0 + f1*n + .. 
+ fk*n^k * The purpose of this construction is to be able to return more than one @@ -581,10 +739,11 @@ mpz_set_ui (f, 0); for (i = 0; i < *nb_curves; i++) { - if (array_stage_found[i] != ECM_NO_FACTOR_FOUND) + /* invert order of factors so they are processed in same order found */ + if (array_found[*nb_curves-1-i] != ECM_NO_FACTOR_FOUND) { mpz_mul (f, f, n); - mpz_add (f, f, factors[i]); + mpz_add (f, f, factors[*nb_curves-1-i]); } } @@ -596,7 +755,7 @@ for (i = 0; i < *nb_curves; i++) mpz_clear (factors[i]); - free (array_stage_found); + free (array_found); free (factors); end_gpu_ecm2: @@ -608,7 +767,5 @@ return youpi; } -#endif - - +#endif /* HAVE_GPU */ diff -Nru gmp-ecm-7.0.4+ds/debian/changelog gmp-ecm-7.0.5+ds/debian/changelog --- gmp-ecm-7.0.4+ds/debian/changelog 2021-10-14 13:33:37.000000000 +0000 +++ gmp-ecm-7.0.5+ds/debian/changelog 2022-06-06 14:24:47.000000000 +0000 @@ -1,3 +1,27 @@ +gmp-ecm (7.0.5+ds-1) unstable; urgency=medium + + * New upstream nano version. + * Debianization: + - d/watch, harden; + - d/copyright: + - copyright year-tuples, update; + - Files-Excluded list, refresh; + - Comment field, erase; + - d/control: + - Maintainer, now Debian Math Team; + - Vcs-*, migration to math-team; + - Standards-Version, bump to 4.6.1 (no change); + - Build-Depends list: + - xsltproc and docbook-xsl, add (to generate manpage); + - libgmp-dev, remove version; + - d/patches/*: + - d/p/upstream-fix-manpage-acute_accent.patch, useless; + - d/rules, harden; + - d/clean, introduce; + - d/upstream/metadata, introduce. 
+ + -- Jerome Benoit Mon, 06 Jun 2022 14:24:47 +0000 + gmp-ecm (7.0.4+ds-6) unstable; urgency=medium * Debianization: diff -Nru gmp-ecm-7.0.4+ds/debian/clean gmp-ecm-7.0.5+ds/debian/clean --- gmp-ecm-7.0.4+ds/debian/clean 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/debian/clean 2022-06-06 13:28:33.000000000 +0000 @@ -0,0 +1 @@ +m4/ diff -Nru gmp-ecm-7.0.4+ds/debian/control gmp-ecm-7.0.5+ds/debian/control --- gmp-ecm-7.0.4+ds/debian/control 2021-10-14 13:23:47.000000000 +0000 +++ gmp-ecm-7.0.5+ds/debian/control 2022-06-06 13:58:50.000000000 +0000 @@ -1,17 +1,18 @@ Source: gmp-ecm Section: math Priority: optional -Maintainer: Debian Science Maintainers +Maintainer: Debian Math Team Uploaders: Jerome Benoit Rules-Requires-Root: no Build-Depends: debhelper-compat (= 13), gnulib, m4, libtool, - libgmp-dev (>= 2:6.1) -Standards-Version: 4.6.0 + libgmp-dev, + xsltproc, docbook-xsl +Standards-Version: 4.6.1 Homepage: https://gitlab.inria.fr/zimmerma/ecm -Vcs-Git: https://salsa.debian.org/science-team/gmp-ecm.git -Vcs-Browser: https://salsa.debian.org/science-team/gmp-ecm +Vcs-Git: https://salsa.debian.org/math-team/gmp-ecm.git +Vcs-Browser: https://salsa.debian.org/math-team/gmp-ecm Package: gmp-ecm Architecture: any diff -Nru gmp-ecm-7.0.4+ds/debian/copyright gmp-ecm-7.0.5+ds/debian/copyright --- gmp-ecm-7.0.4+ds/debian/copyright 2021-10-14 13:26:22.000000000 +0000 +++ gmp-ecm-7.0.5+ds/debian/copyright 2022-06-06 14:18:22.000000000 +0000 @@ -2,46 +2,16 @@ Upstream-Name: gmp-ecm Upstream-Contact: Paul Zimmermann Source: https://gitlab.inria.fr/zimmerma/ecm -Comment: - The upstream source tarball is repacked to drop off some substantial - weight and to allow clean git-buildpackage builds by mainly cleaning - up regenerated files (mainly autotools related material). 
Files-Excluded: - build.vc12/ecm/Makefile.in - build.vc12/libecm/Makefile.in - build.vc12/tune/Makefile.in - build.vc12/libecm_gpu/Makefile.in - build.vc12/bench_mulredc/Makefile.in - build.vc12/assembler/Makefile.in - build.vc12/ecm_gpu/Makefile.in - build.vc12/Makefile.in - athlon/Makefile.in - pentium4/Makefile.in - powerpc64/Makefile.in - x86_64/Makefile.in - aprtcle/Makefile.in - m4/ltversion.m4 - m4/ltoptions.m4 - m4/libtool.m4 - m4/ltsugar.m4 - m4/lt~obsolete.m4 - aclocal.m4 - config.h.in - Makefile.in - compile - missing - ltmain.sh - install-sh - depcomp - test-driver - config.sub - config.guess - configure - INSTALL + .gitlab-ci.yml + .gitignore + .mailmap + m4 + ecm.1 Files: * Copyright: - Copyright (C) 2001-2021 the GMP-ECM Project Team + Copyright (C) 2001-2022 the GMP-ECM Project Team Paul Zimmermann Cyril Bouvier David Cleaver @@ -80,7 +50,7 @@ bestd.c ecm2.c ecm.c - ecm.h + ecm.h.in ecm-impl.h ecm_ntt.c factor.c @@ -116,7 +86,6 @@ aprtcle/mpz_aprcl.c aprtcle/mpz_aprcl.h powerpc64/redc.asm - powerpc64/mulredc*.asm powerpc64/mulredc*.m4 powerpc64/powerpc-defs.m4 Copyright: @@ -157,7 +126,7 @@ Files: debian/* Copyright: - 2015-2021 Jerome Benoit + 2015-2022 Jerome Benoit 2003-2014 Laurent Fousse License: GPL-3+ Comment: diff -Nru gmp-ecm-7.0.4+ds/debian/patches/series gmp-ecm-7.0.5+ds/debian/patches/series --- gmp-ecm-7.0.4+ds/debian/patches/series 2021-10-14 12:04:17.000000000 +0000 +++ gmp-ecm-7.0.5+ds/debian/patches/series 2022-06-06 13:43:44.000000000 +0000 @@ -1,4 +1,3 @@ #upstream-libtoolization-version_script.patch -upstream-fix-manpage-acute_accent.patch upstream-national_encoding.patch debianization-examples.patch diff -Nru gmp-ecm-7.0.4+ds/debian/patches/upstream-fix-manpage-acute_accent.patch gmp-ecm-7.0.5+ds/debian/patches/upstream-fix-manpage-acute_accent.patch --- gmp-ecm-7.0.4+ds/debian/patches/upstream-fix-manpage-acute_accent.patch 2021-10-14 13:23:05.000000000 +0000 +++ 
gmp-ecm-7.0.5+ds/debian/patches/upstream-fix-manpage-acute_accent.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,61 +0,0 @@ -Description: upstream fix: manpage: technical typos - Fix acute-accent typos in ecm manpage as reported by lintian, see - - for furhter information. -Origin: vendor, Debian -Forwarded: by-email -Author: Jerome Benoit -Last-Update: 2021-10-14 - ---- a/ecm.1 -+++ b/ecm.1 -@@ -158,8 +158,8 @@ - \fB\-power \fR\fB\fIn\fR\fR - .RS 4 - [ECM, P\-1] Use x^\fIn\fR --for Brent\-Suyama\'s extension (\fB\-power 1\fR --disables Brent\-Suyama\'s extension)\&. The default polynomial is chosen depending on the method and B2\&. For P\-1 and P+1, disables the fast stage 2\&. For P\-1, -+for Brent\-Suyama's extension (\fB\-power 1\fR -+disables Brent\-Suyama's extension)\&. The default polynomial is chosen depending on the method and B2\&. For P\-1 and P+1, disables the fast stage 2\&. For P\-1, - \fIn\fR - must be even\&. - .RE -@@ -167,7 +167,7 @@ - \fB\-dickson \fR\fB\fIn\fR\fR - .RS 4 - [ECM, P\-1] Use degree\-\fIn\fR --Dickson\'s polynomial for Brent\-Suyama\'s extension\&. For P\-1 and P+1, disables the fast stage 2\&. Like for -+Dickson's polynomial for Brent\-Suyama's extension\&. For P\-1 and P+1, disables the fast stage 2\&. Like for - \fB\-power\fR, - \fIn\fR - must be even for P\-1\&. -@@ -212,17 +212,17 @@ - .PP - \fB\-mpzmod\fR - .RS 4 --Use GMP\'s mpz_mod function (sub\-quadratic for large inputs, but induces some overhead for small ones)\&. -+Use GMP's mpz_mod function (sub\-quadratic for large inputs, but induces some overhead for small ones)\&. - .RE - .PP - \fB\-modmuln\fR - .RS 4 --Use Montgomery\'s multiplication (quadratic version)\&. Usually best method for small input\&. -+Use Montgomery's multiplication (quadratic version)\&. Usually best method for small input\&. - .RE - .PP - \fB\-redc\fR - .RS 4 --Use Montgomery\'s multiplication (sub\-quadratic version)\&. Theoretically optimal for large input\&. 
-+Use Montgomery's multiplication (sub\-quadratic version)\&. Theoretically optimal for large input\&. - .RE - .PP - \fB\-nobase2\fR -@@ -241,7 +241,7 @@ - .RE - .SH "FILE I/O" - .PP --The following options enable one to perform step 1 and step 2 separately, either on different machines, at different times, or using different software (in particular, George Woltman\'s Prime95/mprime program can produce step 1 output suitable for resuming with GMP\-ECM)\&. It can also be useful to split step 2 into several runs, using the -+The following options enable one to perform step 1 and step 2 separately, either on different machines, at different times, or using different software (in particular, George Woltman's Prime95/mprime program can produce step 1 output suitable for resuming with GMP\-ECM)\&. It can also be useful to split step 2 into several runs, using the - \fIB2min\-B2max\fR - option\&. - .PP diff -Nru gmp-ecm-7.0.4+ds/debian/patches/upstream-national_encoding.patch gmp-ecm-7.0.5+ds/debian/patches/upstream-national_encoding.patch --- gmp-ecm-7.0.4+ds/debian/patches/upstream-national_encoding.patch 2021-10-14 13:22:44.000000000 +0000 +++ gmp-ecm-7.0.5+ds/debian/patches/upstream-national_encoding.patch 2022-06-06 12:52:35.000000000 +0000 @@ -18,7 +18,7 @@ and the NTT code. Jason S. Papadopoulos contributed optimizations to the NTT code. -@@ -30,7 +30,7 @@ +@@ -32,7 +32,7 @@ author of the first version of the program. 
Several people also helped by suggesting improvements, or testing diff -Nru gmp-ecm-7.0.4+ds/debian/rules gmp-ecm-7.0.5+ds/debian/rules --- gmp-ecm-7.0.4+ds/debian/rules 2021-10-12 16:21:32.000000000 +0000 +++ gmp-ecm-7.0.5+ds/debian/rules 2022-06-06 14:05:13.000000000 +0000 @@ -1,4 +1,5 @@ #!/usr/bin/make -f +include /usr/share/dpkg/pkg-info.mk DEB_HOST_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) @@ -8,13 +9,19 @@ export DEB_BUILD_MAINT_OPTIONS=hardening=+all +export ACLOCAL_PATH=/usr/share/gnulib/m4 + +export XSLDIR=/usr/share/xml/docbook/stylesheet/docbook-xsl + CFLAGS := $(subst -O2,-O3,$(CFLAGS)) +BUILDIR=_BUILD + default: @uscan --no-conf --dehs --report || true %: - dh $@ --builddirectory=_build + dh $@ --builddirectory=$(BUILDIR) override_dh_auto_configure: dh_auto_configure -- --enable-shared --enable-maintainer-mode @@ -26,7 +33,7 @@ true override_dh_auto_install-indep: - $(MAKE) -C _build install-data-am DESTDIR=$(CURDIR)/debian/tmp + $(MAKE) -C $(BUILDIR) install-data-am DESTDIR=$(CURDIR)/debian/tmp override_dh_compress-indep: dh_compress -Xexamples diff -Nru gmp-ecm-7.0.4+ds/debian/upstream/metadata gmp-ecm-7.0.5+ds/debian/upstream/metadata --- gmp-ecm-7.0.4+ds/debian/upstream/metadata 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/debian/upstream/metadata 2022-06-06 13:20:35.000000000 +0000 @@ -0,0 +1,4 @@ +Bug-Database: https://gitlab.inria.fr/zimmerma/ecm/-/issues +Bug-Submit: https://gitlab.inria.fr/zimmerma/ecm/-/issues/new +Repository: https://gitlab.inria.fr/zimmerma/ecm.git +Repository-Browse: https://gitlab.inria.fr/zimmerma/ecm diff -Nru gmp-ecm-7.0.4+ds/debian/watch gmp-ecm-7.0.5+ds/debian/watch --- gmp-ecm-7.0.4+ds/debian/watch 2021-10-14 11:40:05.000000000 +0000 +++ gmp-ecm-7.0.5+ds/debian/watch 2022-06-06 12:30:03.000000000 +0000 @@ -1,3 +1,7 @@ version=4 opts=repack,compression=xz,dversionmangle=s/\+ds//,repacksuffix=+ds,uversionmangle=s/-(rc\d*)$/~$1/ \ -https://gitlab.inria.fr/zimmerma/ecm/-/tags?sort=updated_desc 
.*/archive/(\d\S+)/.*\.tar\.bz2 +https://gitlab.inria.fr/zimmerma/ecm/tags?sort=updated_desc archive/(?:git-|)@ANY_VERSION@/ecm-(?:git-|)\d\S*@ARCHIVE_EXT@ + +## https://gitlab.inria.fr/zimmerma/ecm/-/archive/git-7.0.5/ecm-git-7.0.5.tar.gz +## https://gitlab.inria.fr/zimmerma/ecm/-/archive/7.0.4/ecm-7.0.4.tar.gz +## https://gitlab.inria.fr/zimmerma/ecm/-/archive/git-7.0.3/ecm-git-7.0.3.tar.gz diff -Nru gmp-ecm-7.0.4+ds/dummy2.save gmp-ecm-7.0.5+ds/dummy2.save --- gmp-ecm-7.0.4+ds/dummy2.save 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/dummy2.save 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ + + + # this is a comment line and should be ignored diff -Nru gmp-ecm-7.0.4+ds/dummy.save gmp-ecm-7.0.5+ds/dummy.save --- gmp-ecm-7.0.4+ds/dummy.save 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/dummy.save 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ +METHOD ECM; PARAM=0; SIGMA=585928442; B1=174000; N=17061648125571273329563156588435816942778260706938821014533; X=0x1a2a694df04a5c037fd12f42668b474f16b7818933f4c8484; CHECKSUM=1505596339; PROGRAM=GMP-ECM 7.0-dev; Y=0x0; X0=0x0; Y0=0x0; WHO=zimmerma@tarte; TIME=Tue Apr 7 11:00:55 2015; +METHOD=ECM; PARAM=0; SIGMA=585928442; B1=174000; N=17061648125571273329563156588435816942778260706938821014533; X=0x1a2a694df04a5c037fd12f42668b474f16b7818933f4c8484; CHECKSUM=505596339; PROGRAM=GMP-ECM 7.0-dev; Y=0x0; X0=0x0; Y0=0x0; WHO=zimmerma@tarte; TIME=Tue Apr 7 11:00:55 2015; +METHOD=ECM; PARAM=0; B1=174000; N=17061648125571273329563156588435816942778260706938821014533; X=0x1a2a694df04a5c037fd12f42668b474f16b7818933f4c8484; CHECKSUM=1505596339; PROGRAM=GMP-ECM 7.0-dev; Y=0x0; X0=0x0; Y0=0x0; WHO=zimmerma@tarte; TIME=Tue Apr 7 11:00:55 2015; diff -Nru gmp-ecm-7.0.4+ds/ecm.1 gmp-ecm-7.0.5+ds/ecm.1 --- gmp-ecm-7.0.4+ds/ecm.1 2016-05-24 15:31:33.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ecm.1 1970-01-01 00:00:00.000000000 +0000 @@ -1,505 +0,0 @@ -'\" t -.\" Title: ECM -.\" Author: [see the "AUTHORS" section] 
-.\" Generator: DocBook XSL Stylesheets v1.75.2 -.\" Date: 03/01/2013 -.\" Manual: April 22, 2003 -.\" Source: April 22, 2003 -.\" Language: English -.\" -.TH "ECM" "1" "03/01/2013" "April 22, 2003" "April 22, 2003" -.\" ----------------------------------------------------------------- -.\" * set default formatting -.\" ----------------------------------------------------------------- -.\" disable hyphenation -.nh -.\" disable justification (adjust text to left margin only) -.ad l -.\" ----------------------------------------------------------------- -.\" * MAIN CONTENT STARTS HERE * -.\" ----------------------------------------------------------------- -.SH "NAME" -ecm \- integer factorization using ECM, P\-1 or P+1 -.SH "SYNOPSIS" -.HP \w'\fBecm\fR\ 'u -\fBecm\fR [\fBoptions\fR] \fIB1\fR [\fIB2min\fR\-\fIB2max\fR | \fIB2\fR] -.br - -.SH "DESCRIPTION" -.PP -ecm is an integer factoring program using the Elliptic Curve Method (ECM), the P\-1 method, or the P+1 method\&. The following sections describe parameters relevant to these algorithms\&. -.SH "STEP 1 AND STEP 2 BOUND PARAMETERS" -.PP -\fB\fIB1\fR\fR -.RS 4 -\fIB1\fR -is the step 1 bound\&. It is a mandatory parameter\&. It can be given either in integer format (for example 3000000) or in floating\-point format (3000000\&.0 or 3e6)\&. The largest possible -\fIB1\fR -value is 9007199254740996 for P\-1, and ULONG_MAX or 9007199254740996 (whichever is smaller) for ECM and P+1\&. All primes 2 <= p <= -\fIB1\fR -are processed in step 1\&. -.RE -.PP -\fB\fIB2\fR\fR -.RS 4 -\fIB2\fR -is the step 2 bound\&. It is optional: if omitted, a default value is computed from -\fIB1\fR, which should be close to optimal\&. Like -\fIB1\fR, it can be given either in integer or in floating\-point format\&. The largest possible value of -\fIB2\fR -is approximately 9e23, but depends on the number of blocks -\fIk\fR -if you specify the -\fB\-k\fR -option\&. All primes -\fIB1\fR -<= p <= -\fIB2\fR -are processed in step 2\&. 
If -\fIB2\fR -< -\fIB1\fR, no step 2 is performed\&. -.RE -.PP -\fB\fIB2min\fR\fR\fB\-\fR\fB\fIB2max\fR\fR -.RS 4 -alternatively one may use the -\fIB2min\fR\-\fIB2max\fR -form, which means that all primes -\fIB2min\fR -<= p <= -\fIB2max\fR -should be processed\&. Thus specifying -\fIB2\fR -only corresponds to -\fIB1\fR\-\fIB2\fR\&. The values of -\fIB2min\fR -and -\fIB2max\fR -may be arbitrarily large, but their difference must not exceed approximately 9e23, subject to the number of blocks -\fIk\fR\&. -.RE -.SH "FACTORING METHOD" -.PP -\fB\-pm1\fR -.RS 4 -Perform P\-1 instead of the default method (ECM)\&. -.RE -.PP -\fB\-pp1\fR -.RS 4 -Perform P+1 instead of the default method (ECM)\&. -.RE -.SH "GROUP AND INITIAL POINT PARAMETERS" -.PP -\fB\-x0 \fR\fB\fIx\fR\fR -.RS 4 -[ECM, P\-1, P+1] Use -\fIx\fR -(arbitrary\-precision integer or rational) as initial point\&. For example, -\fB\-x0 1/3\fR -is valid\&. If not given, -\fIx\fR -is generated from the sigma value for ECM, or at random for P\-1 and P+1\&. -.RE -.PP -\fB\-sigma \fR\fB\fIs\fR\fR -.RS 4 -[ECM] Use -\fIs\fR -(arbitrary\-precision integer) as curve generator\&. If omitted, -\fIs\fR -is generated at random\&. -.RE -.PP -\fB\-A \fR\fB\fIa\fR\fR -.RS 4 -[ECM] Use -\fIa\fR -(arbitrary\-precision integer) as curve parameter\&. If omitted, is it generated from the sigma value\&. -.RE -.PP -\fB\-go \fR\fB\fIval\fR\fR -.RS 4 -[ECM, P\-1, P+1] Multiply the initial point by -\fIval\fR, which can any valid expression, possibly containing the special character N as place holder for the current input number\&. Example: -.sp -.if n \{\ -.RS 4 -.\} -.nf -ecm \-pp1 \-go "N^2\-1" 1e6 < composite2000 -.fi -.if n \{\ -.RE -.\} -.sp -.RE -.SH "STEP 2 PARAMETERS" -.PP -\fB\-k \fR\fB\fIk\fR\fR -.RS 4 -[ECM, P\-1, P+1] Perform -\fIk\fR -blocks in step 2\&. For a given -\fIB2\fR -value, increasing -\fIk\fR -decreases the memory usage of step 2, at the expense of more cpu time\&. 
-.RE -.PP -\fB\-treefile \fR\fB\fIfile\fR\fR -.RS 4 -Stores some tables of data in disk files to reduce the amount of memory occupied in step 2, at the expense of disk I/O\&. Data will be written to files -\fIfile\fR\&.1, -\fIfile\fR\&.2 etc\&. Does not work with fast stage 2 for P+1 and P\-1\&. -.RE -.PP -\fB\-power \fR\fB\fIn\fR\fR -.RS 4 -[ECM, P\-1] Use x^\fIn\fR -for Brent\-Suyama\'s extension (\fB\-power 1\fR -disables Brent\-Suyama\'s extension)\&. The default polynomial is chosen depending on the method and B2\&. For P\-1 and P+1, disables the fast stage 2\&. For P\-1, -\fIn\fR -must be even\&. -.RE -.PP -\fB\-dickson \fR\fB\fIn\fR\fR -.RS 4 -[ECM, P\-1] Use degree\-\fIn\fR -Dickson\'s polynomial for Brent\-Suyama\'s extension\&. For P\-1 and P+1, disables the fast stage 2\&. Like for -\fB\-power\fR, -\fIn\fR -must be even for P\-1\&. -.RE -.PP -\fB\-maxmem \fR\fB\fIn\fR\fR -.RS 4 -Use at most -\fIn\fR -megabytes of memory in stage 2\&. -.RE -.PP -\fB\-ntt\fR, \fB\-no\-ntt\fR -.RS 4 -Enable or disable the Number\-Theoretic Transform code for polynomial arithmetic in stage 2\&. With NTT, dF is chosen to be a power of 2, and is limited by the number suitable primes that fit in a machine word (which is a limitation only on 32 bit systems)\&. The \-no\-ntt variant uses more memory, but is faster than NTT with large input numbers\&. By default, NTT is used for P\-1, P+1 and for ECM on numbers of size at most 30 machine words\&. -.RE -.SH "OUTPUT" -.PP -\fB\-q\fR -.RS 4 -Quiet mode\&. Found factorizations are printed on standard output, with factors separated by white spaces, one line per input number (if no factor was found, the input number is simply copied)\&. -.RE -.PP -\fB\-v\fR -.RS 4 -Verbose mode\&. More information is printed, more -\fB\-v\fR -options increase verbosity\&. 
With one -\fB\-v\fR, the kind of modular multiplication used, initial x0 value, step 2 parameters and progress, and expected curves and time to find factors of different sizes for ECM are printed\&. With -\fB\-v \-v\fR, the A value for ECM and residues at the end of step 1 and step 2 are printed\&. More -\fB\-v\fR -print internal data for debugging\&. -.RE -.PP -\fB\-timestamp\fR -.RS 4 -Print a time stamp whenever a new ECM curve or P+1 or P\-1 run is processed\&. -.RE -.SH "MODULAR ARITHMETIC OPTIONS" -.PP -Several algorithms are available for modular multiplication\&. The program tries to find the best one for each input; one can force a given method with the following options\&. -.PP -\fB\-mpzmod\fR -.RS 4 -Use GMP\'s mpz_mod function (sub\-quadratic for large inputs, but induces some overhead for small ones)\&. -.RE -.PP -\fB\-modmuln\fR -.RS 4 -Use Montgomery\'s multiplication (quadratic version)\&. Usually best method for small input\&. -.RE -.PP -\fB\-redc\fR -.RS 4 -Use Montgomery\'s multiplication (sub\-quadratic version)\&. Theoretically optimal for large input\&. -.RE -.PP -\fB\-nobase2\fR -.RS 4 -Disable special base\-2 code (which is used when the input number is a large factor of 2^n+1 or 2^n\-1, see -\fB\-v\fR)\&. -.RE -.PP -\fB\-base2\fR \fIn\fR -.RS 4 -Force use of special base\-2 code, input number must divide 2^\fIn\fR+1 if -\fIn\fR -> 0, or 2^|\fIn\fR|\-1 if -\fIn\fR -< 0\&. -.RE -.SH "FILE I/O" -.PP -The following options enable one to perform step 1 and step 2 separately, either on different machines, at different times, or using different software (in particular, George Woltman\'s Prime95/mprime program can produce step 1 output suitable for resuming with GMP\-ECM)\&. It can also be useful to split step 2 into several runs, using the -\fIB2min\-B2max\fR -option\&. -.PP -\fB\-inp \fR\fB\fIfile\fR\fR -.RS 4 -Take input from file -\fIfile\fR -instead of from standard input\&. 
-.RE -.PP -\fB\-save \fR\fB\fIfile\fR\fR -.RS 4 -Save result of step 1 in -\fIfile\fR\&. If -\fIfile\fR -exists, an error is raised\&. Example: to perform only step 1 with -\fIB1\fR=1000000 on the composite number in the file "c155" and save its result in file "foo", use -.sp -.if n \{\ -.RS 4 -.\} -.nf -ecm \-save foo 1e6 1 < c155 -.fi -.if n \{\ -.RE -.\} -.sp -.RE -.PP -\fB\-savea \fR\fB\fIfile\fR\fR -.RS 4 -Like -\fB\-save\fR, but appends to existing files\&. -.RE -.PP -\fB\-resume \fR\fB\fIfile\fR\fR -.RS 4 -Resume residues from -\fIfile\fR, reads from standard input if -\fIfile\fR -is "\-"\&. Example: to perform step 2 following the above step 1 computation, use -.sp -.if n \{\ -.RS 4 -.\} -.nf -ecm \-resume foo 1e6 -.fi -.if n \{\ -.RE -.\} -.sp -.RE -.PP -\fB\-chkpoint \fR\fB\fIfile\fR\fR -.RS 4 -Periodically write the current residue in stage 1 to -\fIfile\fR\&. In case of a power failure, etc\&., the computation can be continued with the -\fB\-resume\fR -option\&. -.sp -.if n \{\ -.RS 4 -.\} -.nf -ecm \-chkpnt foo \-pm1 1e10 < largenumber\&.txt -.fi -.if n \{\ -.RE -.\} -.sp -.RE -.SH "LOOP MODE" -.PP -The -\(lqloop mode\(rq -(option -\fB\-c \fR\fB\fIn\fR\fR) enables one to run several curves on each input number\&. The following options control its behavior\&. -.PP -\fB\-c \fR\fB\fIn\fR\fR -.RS 4 -Perform -\fIn\fR -runs on each input number (default is one)\&. This option is mainly useful for P+1 (for example with -\fIn\fR=3) or for ECM, where -\fIn\fR -could be set to the expected number of curves to find a d\-digit factor with a given step 1 bound\&. This option is incompatible with -\fB\-resume, \-sigma, \-x0\fR\&. Giving -\fB\-c 0\fR -produces an infinite loop until a factor is found\&. -.RE -.PP -\fB\-one\fR -.RS 4 -In loop mode, stop when a factor is found; the default is to continue until the cofactor is prime or the specified number of runs are done\&. 
-.RE -.PP -\fB\-b\fR -.RS 4 -Breadth\-first processing: in loop mode, run one curve for each input number, then a second curve for each one, and so on\&. This is the default mode with -\fB\-inp\fR\&. -.RE -.PP -\fB\-d\fR -.RS 4 -Depth\-first processing: in loop mode, run -\fIn\fR -curves for the first number, then -\fIn\fR -curves for the second one and so on\&. This is the default mode with standard input\&. -.RE -.PP -\fB\-I \fR\fB\fIn\fR\fR -.RS 4 -In loop mode, multiply -\fIB1\fR -by a factor depending on -\fIn\fR -after each curve\&. Default is one which should be optimal on one machine, while -\fB\-I 10\fR -could be used when trying to factor the same number simultaneously on 10 identical machines\&. -.RE -.SH "SHELL COMMAND EXECUTION" -.PP -These options allow for executing shell commands to supplement functionality to GMP\-ECM\&. -.PP -.RE -.SH "MISCELLANEOUS" -.PP -\fB\-stage1time \fR\fB\fIn\fR\fR -.RS 4 -Add -\fIn\fR -seconds to stage 1 time\&. This is useful to get correct expected time with -\fI\-v\fR -if part of stage 1 was done in another run\&. -.RE -.PP -\fB\-h\fR, \fB\-\-help\fR -.RS 4 -Display a short description of ecm usage, parameters and command line options\&. -.RE -.PP -\fB\-printconfig\fR -.RS 4 -Prints configuration parameters used for the compilation and exits\&. -.RE -.SH "INPUT SYNTAX" -.PP -The input numbers can have several forms: -.PP -Raw decimal numbers like 123456789\&. -.PP -Comments can be placed in the file: everything after -\(lq//\(rq -is ignored, up to the end of line\&. -.PP -Line continuation\&. If a line ends with a backslash character -\(lq\e\(rq, it is considered to continue on the next line\&. -.PP -Common arithmetic expressions can be used\&. Example: -\fI3*5+2^10\fR\&. -.PP -Factorial: example -\fI53!\fR\&. -.PP -Multi\-factorial: example -\fI15!3\fR -means 15*12*9*6*3\&. -.PP -Primorial: example -\fI11#\fR -means 2*3*5*7*11\&. -.PP -Reduced primorial: example -\fI17#5\fR -means 5*7*11*13*17\&. 
-.PP -Functions: currently, the only available function is -\fIPhi(x,n)\fR\&. -.SH "EXIT STATUS" -.PP -The exit status reflects the result of the last ECM curve or P\-1/P+1 attempt the program performed\&. Individual bits signify particular events, specifically: -.PP -Bit 0 -.RS 4 -0 if normal program termination, 1 if error occurred -.RE -.PP -Bit 1 -.RS 4 -0 if no proper factor was found, 1 otherwise -.RE -.PP -Bit 2 -.RS 4 -0 if factor is composite, 1 if factor is a probable prime -.RE -.PP -Bit 3 -.RS 4 -0 if cofactor is composite, 1 if cofactor is a probable prime -.RE -.PP -Thus, the following exit status values may occur: -.PP -0 -.RS 4 -Normal program termination, no factor found -.RE -.PP -1 -.RS 4 -Error -.RE -.PP -2 -.RS 4 -Composite factor found, cofactor is composite -.RE -.PP -6 -.RS 4 -Probable prime factor found, cofactor is composite -.RE -.PP -8 -.RS 4 -Input number found -.RE -.PP -10 -.RS 4 -Composite factor found, cofactor is a probable prime -.RE -.PP -14 -.RS 4 -Probable prime factor found, cofactor is a probable prime -.RE -.SH "BUGS" -.PP -Report bugs to , after checking for bug fixes or new versions\&. -.SH "AUTHORS" -.PP -Pierrick Gaudry contributed efficient assembly code for combined mul/redc; -.PP -Jim Fougeron contributed the expression parser and several command\-line options; -.PP -Laurent Fousse contributed the middle product code, the autoconf/automake tools, and is the maintainer of the Debian package; -.PP -Alexander Kruppa <(lastname)al@loria\&.fr> contributed estimates for probability of success for ECM, the new P+1 and P\-1 stage 2 (with P\&.\-L\&. Montgomery), new AMD64 asm mulredc code, and some other things; -.PP -Dave Newman contributed the Kronecker\-Schoenhage and NTT multiplication code; -.PP -Jason S\&. Papadopoulos contributed a speedup of the NTT code -.PP -Paul Zimmermann is the author of the first version of the program and chief maintainer of GMP\-ECM\&. 
-.PP -Note: email addresses have been obscured, the required substitutions should be obvious\&. diff -Nru gmp-ecm-7.0.4+ds/ecm2.c gmp-ecm-7.0.5+ds/ecm2.c --- gmp-ecm-7.0.4+ds/ecm2.c 2016-02-23 14:03:23.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ecm2.c 2022-06-06 14:16:49.000000000 +0000 @@ -37,8 +37,8 @@ mpmod_t modulus, mpres_t u, mpres_t v, mpres_t *T, unsigned long *tot_muls, unsigned long *tot_gcds) { - unsigned int i, maxbit, k; /* k is the number of values to batch invert */ - unsigned int l, t, muls = 0, gcds = 0; + ecm_uint i, maxbit, k; /* k is the number of values to batch invert */ + ecm_uint l, t, muls = 0, gcds = 0; #ifdef WANT_EXPCOST unsigned int hamweight = 0; #endif @@ -104,8 +104,8 @@ mpres_add (T[k++], s.y, s.y, modulus); for (i = 0; i < n && !youpi; i++) - if (mpz_tstbit (q[i], t)) /* If q[i] & (1< 0) @@ -138,9 +138,9 @@ l = k - 1; for (i = n; i-- > 0; ) /* Go through the R[i] again, backwards */ - if (mpz_tstbit (q[i], t)) + if (ecm_tstbit (q[i], t)) { - if (mpz_tstbit (flag, i)) + if (ecm_tstbit (flag, i)) { /* T[k] contains 1/(v[0]*...*v[l]) */ if (l > 0) /* need to separate the values */ @@ -230,7 +230,7 @@ /* Now take inverse points (negative y-coordinate) where q[i] was < 0 */ for (i = 0; i < n; i++) - if (mpz_tstbit (signs, i)) + if (ecm_tstbit (signs, i)) { mpz_neg (R[i].y, R[i].y); mpz_neg (q[i], q[i]); diff -Nru gmp-ecm-7.0.4+ds/ecmbench gmp-ecm-7.0.5+ds/ecmbench --- gmp-ecm-7.0.4+ds/ecmbench 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ecmbench 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,19 @@ +#!/bin/csh +set ECM=$1 +echo Benchmark of $ECM +@ n = 1 +/bin/rm -f ecmbench.data +while ("$n" != "26") + set t0=`echo "3*2^(64*$n-2)-1" | $ECM -sigma 6 1e6 1 |& grep "Step 1" | sed 's/Step 1 took//g' | sed 's/ms//g'` + set t1=`echo "3*2^(64*$n-2)-1" | $ECM -modmuln -sigma 6 1e6 1 |& grep "Step 1" | sed 's/Step 1 took//g' | sed 's/ms//g'` + set t2=`echo "3*2^(64*$n-2)-1" | $ECM -mpzmod -sigma 6 1e6 1 |& grep "Step 1" | sed 's/Step 1 
took//g' | sed 's/ms//g'` + set t3=`echo "3*2^(64*$n-2)-1" | $ECM -redc -sigma 6 1e6 1 |& grep "Step 1" | sed 's/Step 1 took//g' | sed 's/ms//g'` + echo $n " " $t0 " " $t1 " " $t2 " " $t3 >> ecmbench.data + @ n = $n + 1 +end +gnuplot -persist < 0) { - if (mpz_tstbit (e, l)) /* k, k+1 -> 2k+1, 2k+2 */ + if (ecm_tstbit (e, l)) /* k, k+1 -> 2k+1, 2k+2 */ { add3 (x0, z0, x0, z0, x1, z1, x, z, n, u, v, w); /* 2k+1 */ duplicate (x1, z1, x1, z1, n, b, u, v, w); /* 2k+2 */ @@ -641,9 +641,9 @@ (x, y) is initial point A is curve parameter in Weierstrass's form: Y^2 = X^3 + A*X + B, where B = y^2-(x^3+A*x) is implicit - when Etype == ECM_EC_TYPE_HESSIAN: + when Etype == ECM_EC_TYPE_TWISTED_HESSIAN: (x, y) is initial point - A is curve parameter in Hessian form: X^3+Y^3+Z^3=3*A*X*Y*Z + A=a/d is curve parameter in Hessian form: a*X^3+Y^3+Z^3=d*X*Y*Z n is the number to factor B1 is the stage 1 bound batch_s = prod(p^e <= B1) if != 1 @@ -663,7 +663,7 @@ mpres_t xB; ell_point_t Q; uint64_t p = 0, r, last_chkpnt_p; - int ret = ECM_NO_FACTOR_FOUND; + int ret = ECM_NO_FACTOR_FOUND, status; long last_chkpnt_time; prime_info_t prime_info; @@ -683,8 +683,7 @@ #endif /* preload group order */ if (go != NULL){ - if (ell_point_mul (Q, go, P, E, n) == 0){ - mpz_set (f, Q->x); + if (ell_point_mul (f, Q, go, P, E, n) == 0){ ret = ECM_FACTOR_FOUND_STEP1; goto end_of_stage1_w; } @@ -698,8 +697,7 @@ outputf (OUTPUT_VERBOSE, "Using traditional approach to Step 1\n"); for (r = 2; r <= B1; r *= 2) if (r > *B1done){ - if(ell_point_duplicate (Q, P, E, n) == 0){ - mpz_set(f, Q->x); + if(ell_point_duplicate (f, Q, P, E, n) == 0){ ret = ECM_FACTOR_FOUND_STEP1; goto end_of_stage1_w; } @@ -713,11 +711,23 @@ last_chkpnt_p = 3; for (p = getprime_mt (prime_info); p <= B1; p = getprime_mt (prime_info)){ - mpz_set_ui(f, (ecm_uint)p); for (r = p; r <= B1; r *= p){ if (r > *B1done){ - if(ell_point_mul (Q, f, P, E, n) == 0){ - mpz_set(f, Q->x); + mpz_set_ui(f, (ecm_uint)p); + status = ell_point_mul (f, Q, f, P, 
E, n); + if(status == 0){ + } + else if(E->law == ECM_LAW_HOMOGENEOUS){ + if(E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + mpres_gcd(f, Q->x, n); + else + mpres_gcd(f, Q->z, n); + // gmp_printf("gcd=%Zd\n", f); + if(mpz_cmp(f, n->orig_modulus) < 0 + && mpz_cmp_ui(f, 1) > 0) + status = 0; + } + if(status == 0){ ret = ECM_FACTOR_FOUND_STEP1; goto end_of_stage1_w; } @@ -751,18 +761,17 @@ } else{ #if USE_ADD_SUB_CHAINS == 0 /* keeping it simple */ - if (ell_point_mul (Q, batch_s, P, E, n) == 0){ - mpz_set (f, Q->x); + if (ell_point_mul (f, Q, batch_s, P, E, n) == 0){ ret = ECM_FACTOR_FOUND_STEP1; goto end_of_stage1_w; } #else /* batch mode and special coding... */ short *S = NULL; - int w, iS; + size_t iS; + int w; add_sub_unpack(&w, &S, &iS, batch_s); - if (ell_point_mul_add_sub_with_S(Q, P, E, n, w, S, iS) == 0){ - mpz_set (f, Q->x); + if (ell_point_mul_add_sub_with_S(f, Q, P, E, n, w, S, iS) == 0){ ret = ECM_FACTOR_FOUND_STEP1; } #endif @@ -781,7 +790,9 @@ if (chkfilename != NULL) writechkfile (chkfilename, ECM_ECM, *B1done, n, E->a4, P->x, P->y,P->z); prime_info_clear (prime_info); - +#if DEBUG_EC_W >= 2 + printf("lastP="); ell_point_print(P, E, n); printf("\n"); +#endif if(ret != ECM_FACTOR_FOUND_STEP1){ if(ell_point_is_zero(P, E, n) == 1){ /* too bad */ @@ -865,7 +876,7 @@ j += sprintf (outs + j, "%u%c", i, (i < DIGITS_END) ? '\t' : '\n'); outs[j] = '\0'; outputf (OUTPUT_VERBOSE, "Expected number of curves to find a factor " - "of n digits:\n%s", outs); + "of n digits (assuming one exists):\n%s", outs); for (i = DIGITS_START; i <= DIGITS_END; i += DIGITS_INCR) { sep = (i < DIGITS_END) ? '\t' : '\n'; @@ -945,13 +956,11 @@ /* y should be NULL for P+1, and P-1, it contains the y coordinate for the Weierstrass form for ECM (when sigma_is_A = -1). 
*/ -/* if gpu != 0 then it contains the number of curves that will be computed on - the GPU */ void print_B1_B2_poly (int verbosity, int method, double B1, double B1done, mpz_t B2min_param, mpz_t B2min, mpz_t B2, int S, mpz_t sigma, int sigma_is_A, int Etype, - mpz_t y, int param, unsigned int gpu) + mpz_t y, int param, unsigned int nb_curves) { ASSERT ((method == ECM_ECM) || (y == NULL)); ASSERT ((-1 <= sigma_is_A) && (sigma_is_A <= 1)); @@ -981,13 +990,13 @@ outputf (verbosity, ", A=%Zd", sigma); else if (sigma_is_A == 0) { - if (gpu) /* if not 0, contains number_of_curves */ + if (nb_curves > 1) { outputf (verbosity, ", sigma=%d:%Zd", param, sigma); - mpz_add_ui (sigma, sigma, gpu-1); + mpz_add_ui (sigma, sigma, nb_curves-1); outputf (verbosity, "-%d:%Zd", param, sigma); - mpz_sub_ui (sigma, sigma, gpu-1); - outputf (verbosity, " (%u curves)", gpu); + mpz_sub_ui (sigma, sigma, nb_curves-1); + outputf (verbosity, " (%u curves)", nb_curves); } else outputf (verbosity, ", sigma=%d:%Zd", param, sigma); @@ -997,6 +1006,8 @@ outputf (verbosity, ", Weierstrass(A=%Zd,y=%Zd)", sigma, y); else if (Etype == ECM_EC_TYPE_HESSIAN) outputf (verbosity, ", Hessian(D=%Zd,y=%Zd)", sigma, y); + else if (Etype == ECM_EC_TYPE_TWISTED_HESSIAN) + outputf (verbosity, ", twisted Hessian(y=%Zd)", y); } } else if (ECM_IS_DEFAULT_B1_DONE(B1done)) @@ -1102,7 +1113,7 @@ (x, y) contains the new point at the end of Stage 1. 
*/ int -ecm (mpz_t f, mpz_t x, mpz_t y, int *param, mpz_t sigma, mpz_t n, mpz_t go, +ecm (mpz_t f, mpz_t x, mpz_t y, int param, mpz_t sigma, mpz_t n, mpz_t go, double *B1done, double B1, mpz_t B2min_parm, mpz_t B2_parm, unsigned long k, const int S, int verbose, int repr, int nobase2step2, int use_ntt, int sigma_is_A, ell_curve_t zE, @@ -1149,7 +1160,7 @@ #endif /* if a batch mode is requested by the user, this implies ECM_MOD_MODMULN */ - if (repr == ECM_MOD_DEFAULT && IS_BATCH_MODE(*param)) + if (repr == ECM_MOD_DEFAULT && IS_BATCH_MODE(param)) repr = ECM_MOD_MODMULN; /* choose the arithmetic used before the parametrization, since for divisors @@ -1160,26 +1171,32 @@ repr = modulus->repr; /* If the parametrization is not given, choose it. */ - if (*param == ECM_PARAM_DEFAULT) - *param = get_default_param (sigma_is_A, *B1done, repr); + if (param == ECM_PARAM_DEFAULT) + param = get_default_param (sigma_is_A, *B1done, repr); + /* when dealing with several input numbers, if we had already computed + batch_s, but the new number uses the base-2 representation, then we + are forced to use ECM_PARAM_SUYAMA, and we reset batch_s to 1 to avoid + the error "-bsaves/-bloads makes sense in batch mode only" below */ + if (param == ECM_PARAM_SUYAMA) + mpz_set_ui (batch_s, 1); /* In batch mode, we force repr=MODMULN, B1done should be either the default value or greater than B1 x should be either 0 (undetermined) or 2 */ - if (IS_BATCH_MODE(*param)) + if (IS_BATCH_MODE(param)) { if (repr != ECM_MOD_MODMULN) { outputf (OUTPUT_ERROR, "Error, with param %d, repr should be " - "ECM_MOD_MODMULN.\n", *param); + "ECM_MOD_MODMULN.\n", param); return ECM_ERROR; } if (!ECM_IS_DEFAULT_B1_DONE(*B1done) && *B1done < B1) { outputf (OUTPUT_ERROR, "Error, cannot resume with param %d, except " - "for doing only stage 2\n", *param); + "for doing only stage 2\n", param); return ECM_ERROR; } @@ -1195,7 +1212,7 @@ } /* check that if ECM_PARAM_BATCH_SQUARE is used, GMP_NUMB_BITS == 64 */ - if (*param 
== ECM_PARAM_BATCH_SQUARE && GMP_NUMB_BITS == 32) + if (param == ECM_PARAM_BATCH_SQUARE && GMP_NUMB_BITS == 32) { outputf (OUTPUT_ERROR, "Error, parametrization ECM_PARAM_BATCH_SQUARE " "works only with GMP_NUMB_BITS=64\n"); @@ -1210,29 +1227,6 @@ return ECM_ERROR; } - /* loading stage 1 exponent makes sense only in batch mode */ - if (!IS_BATCH_MODE(*param) && mpz_cmp_ui (batch_s, 1) > 0) - { - fprintf (stderr, "Error, -bsaves/-bloads makes sense in batch mode only\n"); - exit (EXIT_FAILURE); - } - - /* Compute s for the batch mode */ - if (IS_BATCH_MODE(*param) && ECM_IS_DEFAULT_B1_DONE(*B1done) && - (B1 != *batch_last_B1_used || mpz_cmp_ui (batch_s, 1) <= 0)) - { - *batch_last_B1_used = B1; - - st = cputime (); - /* construct the batch exponent */ - compute_s (batch_s, B1, NULL); - outputf (OUTPUT_VERBOSE, "Computing batch product (of %" PRIu64 - " bits) of primes up to B1=%1.0f took %ldms\n", - mpz_sizeinbase (batch_s, 2), B1, cputime () - st); - } - - st = cputime (); - /* See what kind of number we have as that may influence optimal parameter selection. Test for base 2 number. Note: this was already done by mpmod_init. 
*/ @@ -1277,7 +1271,7 @@ { if (mpz_sgn (sigma) == 0) { - youpi = get_curve_from_random_parameter (f, P.A, P.x, sigma, *param, + youpi = get_curve_from_random_parameter (f, P.A, P.x, sigma, param, modulus, rng); if (youpi == ECM_ERROR) @@ -1288,13 +1282,13 @@ } else /* Compute A and x0 from given sigma values */ { - if (*param == ECM_PARAM_SUYAMA) + if (param == ECM_PARAM_SUYAMA) youpi = get_curve_from_param0 (f, P.A, P.x, sigma, modulus); - else if (*param == ECM_PARAM_BATCH_SQUARE) + else if (param == ECM_PARAM_BATCH_SQUARE) youpi = get_curve_from_param1 (P.A, P.x, sigma, modulus); - else if (*param == ECM_PARAM_BATCH_2) + else if (param == ECM_PARAM_BATCH_2) youpi = get_curve_from_param2 (f, P.A, P.x, sigma, modulus); - else if (*param == ECM_PARAM_BATCH_32BITS_D) + else if (param == ECM_PARAM_BATCH_32BITS_D) youpi = get_curve_from_param3 (P.A, P.x, sigma, modulus); else { @@ -1323,7 +1317,7 @@ /* Except for batch mode where we know that x0=2 */ if (mpz_sgn (x) == 0) { - if (IS_BATCH_MODE(*param)) + if (IS_BATCH_MODE(param)) mpres_set_ui (P.x, 2, modulus); else { @@ -1340,7 +1334,7 @@ /* Print B1, B2, polynomial and sigma */ print_B1_B2_poly (OUTPUT_NORMAL, ECM_ECM, B1, *B1done, B2min_parm, B2min, B2, root_params.S, sigma, sigma_is_A, E->type, - y, *param, 0); + y, param, 0); #if 0 outputf (OUTPUT_VERBOSE, "b2=%1.0f, dF=%lu, k=%lu, d=%lu, d2=%lu, i0=%Zd\n", @@ -1357,7 +1351,7 @@ on some special curves. 
*/ { - if (*param != ECM_PARAM_TORSION) + if (param != ECM_PARAM_TORSION) { mpres_set_z (P.A, sigma, modulus); /* sigma contains A */ mpres_set_z (P.x, x, modulus); @@ -1407,7 +1401,7 @@ outputf (OUTPUT_VERBOSE, "Can't compute success probabilities for B1 <> B2min\n"); } - else if (*param == ECM_PARAM_DEFAULT) + else if (param == ECM_PARAM_DEFAULT) { outputf (OUTPUT_VERBOSE, "Can't compute success probabilities " "for this parametrization.\n"); @@ -1415,14 +1409,31 @@ else { rhoinit (256, 10); - print_expcurves (B1, B2, dF, k, root_params.S, *param); + print_expcurves (B1, B2, dF, k, root_params.S, param); } } + /* Compute s for the batch mode */ + if (IS_BATCH_MODE(param) && ECM_IS_DEFAULT_B1_DONE(*B1done) && + (B1 != *batch_last_B1_used || mpz_cmp_ui (batch_s, 1) <= 0)) + { + *batch_last_B1_used = B1; + + st = cputime (); + /* construct the batch exponent */ + compute_s (batch_s, B1, NULL); + outputf (OUTPUT_VERBOSE, "Computing batch product (of %" PRIu64 + " bits) of primes up to B1=%1.0f took %ldms\n", + mpz_sizeinbase (batch_s, 2), B1, + elltime (st, cputime ())); + } + + st = cputime (); + #ifdef HAVE_GWNUM /* We will only use GWNUM for numbers of the form k*b^n+c */ - if (gw_b != 0 && B1 >= *B1done && *param == ECM_PARAM_SUYAMA) + if (gw_b != 0 && B1 >= *B1done && param == ECM_PARAM_SUYAMA) youpi = gw_ecm_stage1 (f, &P, modulus, B1, B1done, go, gw_k, gw_b, gw_n, gw_c); /* At this point B1 == *B1done unless interrupted, or no GWNUM ecm_stage1 @@ -1434,10 +1445,10 @@ if (B1 > *B1done || mpz_cmp_ui (go, 1) > 0) { - if (IS_BATCH_MODE(*param)) + if (IS_BATCH_MODE(param)) /* FIXME: go, stop_asap and chkfilename are ignored in batch mode */ youpi = ecm_stage1_batch (f, P.x, P.A, modulus, B1, B1done, - *param, batch_s); + param, batch_s); else{ #ifdef HAVE_ADDLAWS if(E->type == ECM_EC_TYPE_MONTGOMERY) @@ -1480,7 +1491,9 @@ mpres_get_z (x, P.x, modulus); #ifdef HAVE_ADDLAWS - if (E->type == ECM_EC_TYPE_WEIERSTRASS || E->type == ECM_EC_TYPE_HESSIAN) + if (E->type == 
ECM_EC_TYPE_WEIERSTRASS + || E->type == ECM_EC_TYPE_HESSIAN + || E->type == ECM_EC_TYPE_TWISTED_HESSIAN) mpres_get_z (y, P.y, modulus); #endif @@ -1551,6 +1564,26 @@ /* due to that non-trivial kernel(?) */ youpi = mult_by_3(f, P.x, P.y, P.A, modulus); } + else if (E->type == ECM_EC_TYPE_TWISTED_HESSIAN) + { + mpz_t c, rm; + mpz_init(c); + mpz_init(rm); + mpres_get_z(rm, E->a4, modulus); + mpz_rootrem(c, rm, rm, 3); + if(mpz_sgn(rm) != 0){ + printf("ECM_EC_TYPE_TWISTED_HESSIAN: not a cube!\n"); + exit(-1); + } + mpres_set_z(P.A, c, modulus); + mpz_clear(c); + mpz_clear(rm); + youpi = twisted_hessian_to_weierstrass (f, P.x, P.y, P.A, E->a6, modulus); + if(youpi == ECM_NO_FACTOR_FOUND){ + /* due to that non-trivial kernel(?) */ + youpi = mult_by_3(f, P.x, P.y, P.A, modulus); + } + } #endif if (test_verbose (OUTPUT_RESVERBOSE) && youpi == ECM_NO_FACTOR_FOUND && @@ -1573,7 +1606,7 @@ if (youpi == ECM_NO_FACTOR_FOUND && mpz_cmp (B2, B2min) >= 0) youpi = stage2 (f, &P, modulus, dF, k, &root_params, use_ntt, - TreeFilename, stop_asap); + TreeFilename, 0, stop_asap); #ifdef TIMING_CRT printf ("mpzspv_from_mpzv_slow: %dms\n", mpzspv_from_mpzv_slow_time); printf ("mpzspv_to_mpzv: %dms\n", mpzspv_to_mpzv_time); @@ -1583,13 +1616,13 @@ end_of_ecm_rhotable: if (test_verbose (OUTPUT_VERBOSE)) { - if (mpz_cmp_d (B2min, B1) == 0 && *param != ECM_PARAM_DEFAULT) + if (mpz_cmp_d (B2min, B1) == 0 && param != ECM_PARAM_DEFAULT) { if (youpi == ECM_NO_FACTOR_FOUND && (stop_asap == NULL || !(*stop_asap)())) print_exptime (B1, B2, dF, k, root_params.S, (long) (stage1time * 1000.) 
+ - elltime (st, cputime ()), *param); + elltime (st, cputime ()), param); rhoinit (1, 0); /* Free memory of rhotable */ } } diff -Nru gmp-ecm-7.0.4+ds/ecm-gpu.h gmp-ecm-7.0.5+ds/ecm-gpu.h --- gmp-ecm-7.0.4+ds/ecm-gpu.h 2016-04-19 14:42:43.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ecm-gpu.h 2022-06-06 14:16:49.000000000 +0000 @@ -7,6 +7,9 @@ #ifdef WITH_GPU +// Absolute limit of CGBN support +#define ECM_GPU_CGBN_MAX_BITS 32*1024 + #ifndef ECM_GPU_NB_DIGITS #define ECM_GPU_NB_DIGITS 32 //by default #endif @@ -24,7 +27,9 @@ #define VOL volatile //#define VOL -#define ECM_GPU_CURVES_BY_BLOCK 32 +#ifndef ECM_GPU_CURVES_BY_BLOCK + #define ECM_GPU_CURVES_BY_BLOCK 32 +#endif #define ECM_GPU_MAX_BITS ECM_GPU_SIZE_DIGIT * ECM_GPU_NB_DIGITS typedef digit_t VOL biguint_t[ECM_GPU_NB_DIGITS]; @@ -39,10 +44,10 @@ /* cudawrapper.c */ #define gpu_ecm __ECM(gpu_ecm) #ifdef WITH_GPU -int gpu_ecm (mpz_t, mpz_t, int*, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, +int gpu_ecm (mpz_t, mpz_t, int, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, mpz_t, unsigned long, const int, int, int, int, int, int, FILE*, FILE*, char*, char *, double, int (*)(void), mpz_t, - double *, int, int*, unsigned int*); + double *, int, int, int*, unsigned int*); #else int gpu_ecm (); #endif diff -Nru gmp-ecm-7.0.4+ds/ecm.h gmp-ecm-7.0.5+ds/ecm.h --- gmp-ecm-7.0.4+ds/ecm.h 2016-10-11 09:28:12.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ecm.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,196 +0,0 @@ -/* ecm.h. Generated from ecm.h.in by configure. */ -/* ecm.h - public interface for libecm. - -Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 -Paul Zimmermann, Alexander Kruppa, David Cleaver, Cyril Bouvier. - -This file is part of the ECM Library. - -The ECM Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Lesser General Public License as published by -the Free Software Foundation; either version 3 of the License, or (at your -option) any later version. 
- -The ECM Library is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -License for more details. - -You should have received a copy of the GNU Lesser General Public License -along with the ECM Library; see the file COPYING.LIB. If not, see -http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ - -#ifndef _ECM_H -#define _ECM_H 1 - -#include /* for FILE */ -#include - -#define ECM_VERSION "7.0.4" - -#ifdef __cplusplus -extern "C" { -#endif - -#define EC_W_NBUFS 9 /* for Hessian form */ - -/* More ec forms */ -#define ECM_EC_TYPE_MONTGOMERY 1 -#define ECM_EC_TYPE_WEIERSTRASS 2 -#define ECM_EC_TYPE_HESSIAN 3 -#define ECM_EC_TYPE_WEIERSTRASS_COMPLETE 4 - -/* which type of law used */ -#define ECM_LAW_AFFINE 1 -#define ECM_LAW_HOMOGENEOUS 2 - -typedef struct -{ - int type; - int law; - mpz_t a4; /* for MONTGOMERY: b*y^2=x^3+A*x^2+x - for WEIERSTRASS: y^2=x^3+A*x+B - for HESSIAN: U^3+V^3+W^3=3*A*U*V*W */ - mpz_t a1, a3, a2, a6; /* for complete WEIERSTRASS */ - mpz_t buf[EC_W_NBUFS]; /* used in the addition laws */ - int disc; /* in case E is known to have CM by Q(sqrt(disc)) */ - mpz_t sq[10]; /* for CM curves, we might have squareroots */ -} __ell_curve_struct; -typedef __ell_curve_struct ell_curve_t[1]; - -typedef struct -{ - mpz_t x; - mpz_t y; - mpz_t z; -} __ell_point_struct; -typedef __ell_point_struct ell_point_t[1]; - -typedef struct -{ - int method; /* factorization method, default is ecm */ - mpz_t x, y; /* starting point (if non zero) */ - int param; /* (ECM only) What parametrization do we used */ - mpz_t sigma; /* (ECM only) The parameter for the parametrization */ - /* May contains A */ - int sigma_is_A; /* if 1, 'parameter' contains A (Montgomery form), - if 0, 'parameter' contains sigma (Montgomery form), - if -1, 
'parameter' contains A, and the input curve is in - Weierstrass form y^2 = x^3 + A*x + B, with y in 'go'. */ - __ell_curve_struct *E; /* the curve, particularly useful for CM ones */ - mpz_t go; /* initial group order to preload (if NULL: do nothing), - or y for Weierstrass form if sigma_is_A = -1. */ - double B1done; /* step 1 was already done up to B1done */ - mpz_t B2min; /* lower bound for stage 2 (default is B1) */ - mpz_t B2; /* step 2 bound (chosen automatically if < 0.0) */ - unsigned long k;/* number of blocks in stage 2 */ - int S; /* degree of the Brent-Suyama's extension for stage 2 */ - int repr; /* representation for modular arithmetic: ECM_MOD_MPZ=mpz, - ECM_MOD_MODMULN=modmuln (Montgomery's quadratic multiplication), - ECM_MOD_REDC=redc (Montgomery's subquadratic multiplication), - ECM_MOD_GWNUM=Woltman's gwnum routines (tbd), - > 16 : special base-2 representation - MOD_DEFAULT: automatic choice */ - int nobase2step2; /* disable special base-2 code in ecm stage 2 only */ - int verbose; /* verbosity level: 0 no output, 1 normal output, - 2 diagnostic output */ - FILE *os; /* output stream (for verbose messages) */ - FILE *es; /* error stream (for error messages) */ - char *chkfilename; /* Filename to write stage 1 checkpoints to */ - char *TreeFilename; /* Base filename for storing product tree of F */ - double maxmem; /* Maximal amount of memory to use in stage 2, in bytes. - 0. means no limit (optimise only for speed) */ - double stage1time; /* Time to add for estimating expected time to find fac.*/ - gmp_randstate_t rng; /* State of random number generator */ - int use_ntt; /* set to 1 to use ntt poly code in stage 2 */ - int (*stop_asap) (void); /* Pointer to function, if it returns 0, contine - normally, otherwise exit asap. May be NULL */ - /* The batch mode is used for stage 1 when param=1 or param=2)*/ - mpz_t batch_s; /* s is the product of primes up to B1 for batch mode */ - double batch_last_B1_used; /* Last B1 used in batch mode. 
Used to avoid */ - /* computing s when B1 = batch_last_B1_used */ - int gpu; /* do we use the GPU for stage 1. */ - /* If different from 0, the GPU is used */ - /* Else, the parameters beginning by gpu_* have no meaning */ - int gpu_device; /* Which device do we use */ - int gpu_device_init; /* Is the device initialized?*/ - unsigned int gpu_number_of_curves; - double gw_k; /* use for gwnum stage 1 if input has form k*b^n+c */ - unsigned long gw_b; /* use for gwnum stage 1 if input has form k*b^n+c */ - unsigned long gw_n; /* use for gwnum stage 1 if input has form k*b^n+c */ - signed long gw_c; /* use for gwnum stage 1 if input has form k*b^n+c */ -} __ecm_param_struct; -typedef __ecm_param_struct ecm_params[1]; -typedef __ecm_param_struct *ecm_params_ptr; - -#define ECM_MOD_NOBASE2 -1 -#define ECM_MOD_DEFAULT 0 -#define ECM_MOD_MPZ 1 -#define ECM_MOD_BASE2 2 -#define ECM_MOD_MODMULN 3 -#define ECM_MOD_REDC 4 -/* values <= -16 or >= 16 have a special meaning */ - -const char *ecm_version(); -int ecm_factor (mpz_t, mpz_t, double, ecm_params); -void ecm_init (ecm_params); -void ecm_clear (ecm_params); - -/* the following interface is not supported */ -int ecm (mpz_t, mpz_t, mpz_t, int*, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, mpz_t, - unsigned long, int, int, int, int, int, int, - ell_curve_t, FILE* os, FILE* es, - char*, char *, double, double, gmp_randstate_t, int (*)(void), mpz_t, - double *, double, unsigned long, unsigned long, signed long); -int pp1 (mpz_t, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, mpz_t, - unsigned long, int, int, int, FILE*, FILE*, char*, - char *, double, gmp_randstate_t, int (*)(void)); -int pm1 (mpz_t, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, - mpz_t, unsigned long, int, int, int, FILE*, - FILE*, char *, char*, double, gmp_randstate_t, int (*)(void)); - -/* different methods implemented */ -#define ECM_ECM 0 -#define ECM_PM1 1 -#define ECM_PP1 2 - -/* return value of ecm, pm1, pp1 */ -#define ECM_FACTOR_FOUND_STEP1 1 /* 
should be positive */ -#define ECM_FACTOR_FOUND_STEP2 2 /* should be positive */ -#define ECM_NO_FACTOR_FOUND 0 /* should be zero */ -#define ECM_ERROR -1 /* should be non-zero */ -#define ECM_FACTOR_FOUND_P(x) ((x) > 0) -#define ECM_ERROR_P(x) ((x) < 0) - -#define ECM_DEFAULT_B1_DONE 1.0 -#define ECM_IS_DEFAULT_B1_DONE(x) (x <= 1.0) - -/* Different parametrizations used in stage 1 of ECM */ -#define ECM_PARAM_DEFAULT -1 -#define ECM_PARAM_SUYAMA 0 -#define ECM_PARAM_BATCH_SQUARE 1 -#define ECM_PARAM_BATCH_2 2 -#define ECM_PARAM_BATCH_32BITS_D 3 -/* we keep 4 as spare */ -#define ECM_PARAM_WEIERSTRASS 5 -#define ECM_PARAM_HESSIAN 6 -#define ECM_PARAM_TORSION 7 - -/* stage 2 bound */ -#define ECM_DEFAULT_B2 -1 -#define ECM_IS_DEFAULT_B2(x) (mpz_cmp_si (x, ECM_DEFAULT_B2) == 0) - -#define ECM_DEFAULT_K 0 /* default number of blocks in stage 2. 0 = automatic - choice */ -#define ECM_DEFAULT_S 0 /* polynomial is chosen automatically */ - -/* Apple uses '\r' for newlines */ -#define IS_NEWLINE(c) (((c) == '\n') || ((c) == '\r')) - -#ifdef __cplusplus -} -#endif - -#endif /* _ECM_H */ - diff -Nru gmp-ecm-7.0.4+ds/ecm.h.in gmp-ecm-7.0.5+ds/ecm.h.in --- gmp-ecm-7.0.4+ds/ecm.h.in 2016-02-26 07:40:37.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ecm.h.in 2022-06-06 14:16:49.000000000 +0000 @@ -32,13 +32,14 @@ extern "C" { #endif -#define EC_W_NBUFS 9 /* for Hessian form */ +#define EC_W_NBUFS 10 /* for twisted Hessian form */ /* More ec forms */ #define ECM_EC_TYPE_MONTGOMERY 1 #define ECM_EC_TYPE_WEIERSTRASS 2 #define ECM_EC_TYPE_HESSIAN 3 -#define ECM_EC_TYPE_WEIERSTRASS_COMPLETE 4 +#define ECM_EC_TYPE_TWISTED_HESSIAN 4 +#define ECM_EC_TYPE_WEIERSTRASS_COMPLETE 5 /* which type of law used */ #define ECM_LAW_AFFINE 1 @@ -50,7 +51,9 @@ int law; mpz_t a4; /* for MONTGOMERY: b*y^2=x^3+A*x^2+x for WEIERSTRASS: y^2=x^3+A*x+B - for HESSIAN: U^3+V^3+W^3=3*A*U*V*W */ + for HESSIAN: U^3+V^3+W^3=3*A*U*V*W + for TWISTED_HESSIAN: a*X^3+Y^3+Z^3=d*X*Y*Z + */ mpz_t a1, a3, a2, a6; /* for complete 
WEIERSTRASS */ mpz_t buf[EC_W_NBUFS]; /* used in the addition laws */ int disc; /* in case E is known to have CM by Q(sqrt(disc)) */ @@ -112,6 +115,7 @@ int gpu; /* do we use the GPU for stage 1. */ /* If different from 0, the GPU is used */ /* Else, the parameters beginning by gpu_* have no meaning */ + int gpu_cgbn; /* If CGBN should be used for GPU stage 1 computation */ int gpu_device; /* Which device do we use */ int gpu_device_init; /* Is the device initialized?*/ unsigned int gpu_number_of_curves; @@ -134,10 +138,11 @@ const char *ecm_version(); int ecm_factor (mpz_t, mpz_t, double, ecm_params); void ecm_init (ecm_params); +void ecm_reset (ecm_params); void ecm_clear (ecm_params); /* the following interface is not supported */ -int ecm (mpz_t, mpz_t, mpz_t, int*, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, mpz_t, +int ecm (mpz_t, mpz_t, mpz_t, int, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, mpz_t, unsigned long, int, int, int, int, int, int, ell_curve_t, FILE* os, FILE* es, char*, char *, double, double, gmp_randstate_t, int (*)(void), mpz_t, @@ -172,9 +177,10 @@ #define ECM_PARAM_BATCH_2 2 #define ECM_PARAM_BATCH_32BITS_D 3 /* we keep 4 as spare */ -#define ECM_PARAM_WEIERSTRASS 5 -#define ECM_PARAM_HESSIAN 6 -#define ECM_PARAM_TORSION 7 +#define ECM_PARAM_WEIERSTRASS 5 +#define ECM_PARAM_HESSIAN 6 +#define ECM_PARAM_TWISTED_HESSIAN 7 +#define ECM_PARAM_TORSION 8 /* stage 2 bound */ #define ECM_DEFAULT_B2 -1 diff -Nru gmp-ecm-7.0.4+ds/ecm-impl.h gmp-ecm-7.0.5+ds/ecm-impl.h --- gmp-ecm-7.0.4+ds/ecm-impl.h 2016-04-08 12:45:49.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ecm-impl.h 2022-06-06 14:16:49.000000000 +0000 @@ -39,9 +39,9 @@ #endif /* We do not use torsion.[ch] so far since they are not tested enough. */ -/* #define HAVE_TORSION */ +#define HAVE_TORSION /* We do not use addlaws.[ch] so far since they are not tested enough. 
*/ -/* #define HAVE_ADDLAWS */ +#define HAVE_ADDLAWS #include "ecm_int.h" @@ -361,8 +361,8 @@ /* stage2.c */ #define stage2 __ECM(stage2) -int stage2 (mpz_t, void *, mpmod_t, unsigned long, unsigned long, - root_params_t *, int, char *, int (*)(void)); +int stage2 (mpz_t, void *, mpmod_t, unsigned long, unsigned long, + root_params_t *, int, char *, unsigned int, int (*)(void)); #define init_progression_coeffs __ECM(init_progression_coeffs) listz_t init_progression_coeffs (mpz_t, const unsigned long, const unsigned long, const unsigned int, const unsigned int, @@ -614,6 +614,10 @@ double ecmprob (double, double, double, double, int); double pm1prob (double, double, double, double, int, const mpz_t); +/* pm1.c */ +void print_prob (double, const mpz_t, unsigned long, unsigned long, int, + const mpz_t); + /* auxlib.c */ #define mpz_add_si __ECM(mpz_add_si) void mpz_add_si (mpz_t, mpz_t, long); @@ -639,6 +643,16 @@ void writechkfile (char *, int, double, mpmod_t, mpres_t, mpres_t, mpres_t, mpres_t); #define aux_fseek64 __ECM(aux_fseek64) int aux_fseek64(FILE *, const int64_t, const int); +#define ecm_tstbit __ECM(ecm_tstbit) +int ecm_tstbit (mpz_srcptr, ecm_uint); + +/* Due to GMP (6.x and prior) using long as input to mpz_tstbit, factors would be missed + on computers with 32-bit longs in batch mode when using B1 > 2977044736UL. + So, we need to use our own function when long is not 64-bits wide */ +#if ULONG_MAX == 0xffffffffUL +#undef mpz_tstbit +#define mpz_tstbit ecm_tstbit +#endif /* auxarith.c */ #define gcd __ECM(gcd) diff -Nru gmp-ecm-7.0.4+ds/ecmprob.magma gmp-ecm-7.0.5+ds/ecmprob.magma --- gmp-ecm-7.0.4+ds/ecmprob.magma 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ecmprob.magma 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,77 @@ +SetVerbose("MPQS",false); // disable MPQS messages (if any) + +FindGroupOrder := function (p, sigma) + K := GF(p); + v := K ! (4*sigma); + u := K ! 
(sigma^2-5); + x := u^3; + b := 4*x*v; + a := (v-u)^3*(3*u+v); + A := a/b-2; + x := x/v^3; + b := x^3 + A*x^2 + x; + E := EllipticCurve([0,b*A,0,b^2,0]); + return FactoredOrder(E); +end function; + +// ecmprob(20, 11000, 1583400, 1000, 6); +// sigma=6: n= 1000 e2= 3.269 e3= 1.696 e5= 0.281 success= 13/1000 +// sigma=7: n= 1000 e2= 3.304 e3= 1.656 e5= 0.29 success= 11/1000 +// sigma=1/4: n= 1000 e2= 3.327 e3= 1.687 e5= 0.298 success= 3/200 +// ecmprob(25, 50000, 14201460, 1000, 6); +// sigma=6: n= 1000 e2= 3.353 e3= 1.665 e5= 0.319 success= 3/1000 +// sigma=7: n= 1000 e2= 3.293 e3= 1.662 e5= 0.299 success= 1/250 +// sigma=1/4: n= 1000 e2= 3.374 e3= 1.646 e5= 0.291 success= 3/500 +// ecmprob(30, 250000, 173213040, 1000, 6); +// sigma=6: n= 1000 e2= 3.369 e3= 1.655 e5= 0.314 success= 1/250 +// sigma=7: n= 1000 e2= 3.219 e3= 1.676 e5= 0.308 success= 1/500 +// sigma=1/4: n= 1000 e2= 3.373 e3= 1.678 e5= 0.287 success= 3/1000 +// ecmprob(35, 1000000, 975508380, 1000, 6); +// sigma=6: n= 1000 e2= 3.39 e3= 1.699 e5= 0.292 success= 3/1000 +// sigma=7: n= 1000 e2= 3.292 e3= 1.665 e5= 0.329 success= 1/1000 +// sigma=1/4: n= 1000 e2= 3.346 e3= 1.675 e5= 0.321 success= 1/1000 +// ecmprob(40, 3000000, 4595040450, 1000, 6); +// sigma=6: n= 1000 e2= 3.391 e3= 1.667 e5= 0.298 success= 1/1000 +// sigma=7: n= 1000 e2= 3.237 e3= 1.685 e5= 0.308 success= 1/1000 +// sigma=1/4: n= 1000 e2= 3.335 e3= 1.657 e5= 0.251 success= 0 +// ecmprob(45, 11000000, 30114892080, 1000, 6); +// sigma=6: n= 1000 e2= 3.244 e3= 1.721 e5= 0.317 success= 0 +// sigma=7: n= 1000 e2= 3.334 e3= 1.668 e5= 0.295 success= 0 +// sigma=1/4: n= 1000 e2= 3.295 e3= 1.647 e5= 0.305 success= 0 +// ecmprob(50, 43000000, 198674155680, 1000, 6); +// sigma=1/4: n= 1000 e2= 3.396 e3= 1.655 e5= 0.267 success= 0 +// ecmprob(55, 110000000, 729516567780, 1000, 6); +// sigma=1/4: n= 1000 e2= 3.217 e3= 1.629 e5= 0.301 success= 0 +ecmprob := procedure (d, B1, B2, N, sigma) +local p, g, n, e2, e3, e5, i, success, l, S, T; + T := 
RealField(d); + p := Ceiling(Exp((T ! d - T ! 0.5) * Log(T ! 10.0))); + S := RealField(8); + e2 := S ! 0.0; + e3 := S ! 0.0; + e5 := S ! 0.0; + n := 0; // number of primes + success := 0; + for n := 1 to N do + p := NextPrime (p : Proof := false); + g := FindGroupOrder (p, sigma); + i := 1; + l := #g; + if l ge i and g[i][1] eq 2 then + e2 := e2 + g[i][2]; + i := i + 1; + end if; + if l ge i and g[i][1] eq 3 then + e3 := e3 + g[i][2]; + i := i + 1; + end if; + if l ge i and g[i][1] eq 5 then + e5 := e5 + g[i][2]; + i := i + 1; + end if; + if g[l][1] le B2 and (l eq 1 or g[l-1][1] le B1) then + success := success + 1; + end if; + print "n=",n,"e2=",e2/n,"e3=",e3/n,"e5=",e5/n,"success=",success/n; + end for; +end procedure; diff -Nru gmp-ecm-7.0.4+ds/ecm.xml gmp-ecm-7.0.5+ds/ecm.xml --- gmp-ecm-7.0.4+ds/ecm.xml 2016-05-24 15:31:33.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ecm.xml 2022-06-06 14:16:49.000000000 +0000 @@ -687,9 +687,7 @@ BUGS -Report bugs to <ecm-discuss@lists.gforge.inria.fr>, after checking -<http://www.loria.fr/~zimmerma/records/ecmnet.html> for bug fixes -or new versions. +Report bugs on <https://gitlab.inria.fr/zimmerma/ecm/>. 
diff -Nru gmp-ecm-7.0.4+ds/eval.c gmp-ecm-7.0.5+ds/eval.c --- gmp-ecm-7.0.4+ds/eval.c 2016-02-25 18:11:24.000000000 +0000 +++ gmp-ecm-7.0.5+ds/eval.c 2022-06-06 14:16:49.000000000 +0000 @@ -46,8 +46,15 @@ * Simple Primorial: n# 11# == 2*3*5*7*11 * * Reduced Primorial: n#m 17#5 == 5.7.11.13.17 * * * - * Adding (working on these at least: * - * Phi(x,n) * + * Supported functions: (case insensitive) * + * Phi(n,x) * + * GCD(m,n) * + * U(p,q,n) * + * primU(p,q,n) * + * TODO: PhiL(k,n), PhiM(k,n) * + * only for bases 2,3,5,6,7,10,11 (times a square) * + * Note for developers: * + * First k-1 arguments are passed as an mpz_t array * * * * NOTE Lines ending in a \ character are "joined" * * NOTE C++ // single line comments (rest of line is a comment) * @@ -61,8 +68,14 @@ static void eval_power (mpz_t prior_n, mpz_t n,char op); static void eval_product (mpz_t prior_n, mpz_t n,char op); static void eval_sum (mpz_t prior_n, mpz_t n,char op); -static int eval_Phi (mpz_t prior_n, mpz_t n, int ParamCnt); +static int eval_Phi (mpz_t *params, mpz_t n); +static int eval_PhiL (mpz_t *params, mpz_t n); +static int eval_PhiM (mpz_t *params, mpz_t n); +// static int eval_gcd (mpz_t *params, mpz_t n); +static int eval_U (mpz_t *params, mpz_t n); +static int eval_primU (mpz_t *params, mpz_t n); static int eval_2 (int bInFuncParams); +static int aurif (mpz_t output, mpz_t n, mpz_t base, int sign); #if 0 /* strncasecmp is a required function in configure.in */ #if defined (_MSC_VER) || defined (__MINGW32__) @@ -331,8 +344,9 @@ mpz_sub(n,prior_n,n); } -int eval_Phi (mpz_t b, mpz_t n, int ParamCnt) +int eval_Phi (mpz_t* params, mpz_t n) { + /* params[0]=exp, n=base */ int factors[200]; unsigned dwFactors=0, dw; unsigned long B; @@ -340,40 +354,59 @@ mpz_t D, T, org_n; prime_info_t prime_info; - if (ParamCnt == 0) + /* deal with trivial cases first */ + if (mpz_cmp_ui (params[0], 0) == 0) + { + mpz_set_ui (n, 1); + return 1; + } + if (mpz_cmp_ui (params[0], 0) < 0) return 0; - - if 
(mpz_cmp_ui (n, 1) == 0) + if (mpz_cmp_ui (params[0], 1) == 0) { - /* return value is 1 if b is composite, or b if b is prime */ - int isPrime = mpz_probab_prime_p (b, PROBAB_PRIME_TESTS); - if (isPrime) - mpz_set (n, b); - else - mpz_set (n, mpOne); + mpz_sub_ui (n, n, 1); return 1; } - if (mpz_cmp_si (n, -1) == 0) - /* this is actually INVALID, but it is easier to simply */ - return 0; - - /* OK parse the Phi out now */ - if (mpz_cmp_ui (b, 0) <= 0) - return 0; - - if (mpz_cmp_ui (b, 1) == 0) + if (mpz_cmp_ui (params[0], 2) == 0) + { + mpz_add_ui (n, n, 1); + return 1; + } + if (mpz_cmp_ui (n, 0) < 0) + /* Convert to positive base; this is always valid when exp>=3 */ + { + mpz_neg (n, n); + if (mpz_congruent_ui_p (params[0], 1, 2)) + { + mpz_mul_ui(params[0], params[0], 2); + } + else if (mpz_congruent_ui_p (params[0], 2, 4)) + { + mpz_divexact_ui(params[0], params[0], 2); + } + } + if (mpz_cmp_ui (n, 1) == 0) { - if (mpz_cmp_ui (n, 1) != 0) - mpz_sub_ui (n, n, 1); + /* return value is p if params[0] is prime power p^k, or 1 otherwise */ + int maxpower=mpz_sizeinbase(params[0], 2)+1; + mpz_init (T); + for (int power=maxpower; power>=1; --power) + { + if ( mpz_root (T, params[0], power) ) break; + } + int isPrime = mpz_probab_prime_p (T, PROBAB_PRIME_TESTS); + mpz_set (n, isPrime ? 
T : mpOne); + mpz_clear(T); return 1; } + /* Ok, do the real h_primative work, since we are not one of the trivial case */ - if (mpz_fits_ulong_p (b) == 0) + if (mpz_fits_ulong_p (params[0]) == 0) return 0; - B = mpz_get_ui (b); + B = mpz_get_ui (params[0]); /* Obtain the factors of B */ prime_info_init (prime_info); @@ -388,7 +421,7 @@ } } prime_info_clear (prime_info); /* free the prime tables */ - B = mpz_get_si (b); + B = mpz_get_si (params[0]); mpz_init_set (org_n, n); mpz_set_ui (n, 1); @@ -454,20 +487,356 @@ return 1; } +int aurif (mpz_t output, mpz_t n, mpz_t base, int sign) // Evaluate Aurifeullian polynomials +{ + int b,k=mpz_get_ui(n); + mpz_t orig_base; + mpz_t C,D,l,m; + // Find a proper base + mpz_init_set(orig_base,base); + mpz_inits(C,D,l,m,NULL); + for(b=2;b<=11;b++) + { + mpz_set(base,orig_base); + mpz_mul_ui(base,base,b); + if(mpz_perfect_square_p(base)) break; + } + if(b==12) // not found + { + gmp_fprintf (stderr, "Error: base %Zd not supported for Aurifeullian factorization yet\n", orig_base); + return 0; + } + if(k%((b==5)?b:(2*b))!=0) + { + gmp_fprintf (stderr, "Error: exponent %Zd does not make sense for base %Zd\n", n, orig_base); + return 0; + } + k/=((b==5)?b:(2*b)); + if(k%2==0) + { + gmp_fprintf (stderr, "Error: exponent %Zd does not make sense for base %Zd\n", n, orig_base); + return 0; + } + mpz_set(base,orig_base); + mpz_pow_ui(m, base, k); + mpz_mul_ui(l, m, b); + mpz_sqrt(l, l); + switch(b) + { + case 2: + case 3: + mpz_add_ui(C, m, 1); + mpz_set_ui(D, 1); + break; + case 5: + case 6: + mpz_add_ui(C, m, 3); + mpz_mul(C, C, m); + mpz_add_ui(C, C, 1); + mpz_add_ui(D, m, 1); + break; + case 7: + mpz_add_ui(C, m, 1); + mpz_pow_ui(C, C, 3); + mpz_add_ui(D, m, 1); + mpz_mul(D, D, m); + mpz_add_ui(D, D, 1); + break; + case 10: + mpz_add_ui(C, m, 5); + mpz_mul(C, C, m); + mpz_add_ui(C, C, 7); + mpz_mul(C, C, m); + mpz_add_ui(C, C, 5); + mpz_mul(C, C, m); + mpz_add_ui(C, C, 1); + mpz_add_ui(D, m, 2); + mpz_mul(D, D, m); + mpz_add_ui(D, D, 
2); + mpz_mul(D, D, m); + mpz_add_ui(D, D, 1); + break; + case 11: + mpz_add_ui(C, m, 5); + mpz_mul(C, C, m); + mpz_sub_ui(C, C, 1); + mpz_mul(C, C, m); + mpz_sub_ui(C, C, 1); + mpz_mul(C, C, m); + mpz_add_ui(C, C, 5); + mpz_mul(C, C, m); + mpz_add_ui(C, C, 1); + mpz_add_ui(D, m, 1); + mpz_mul(D, D, m); + mpz_sub_ui(D, D, 1); + mpz_mul(D, D, m); + mpz_add_ui(D, D, 1); + mpz_mul(D, D, m); + mpz_add_ui(D, D, 1); + break; + default: // not supposed to arrive here + break; + } + mpz_set(output, C); + (sign>0 ? mpz_addmul : mpz_submul)(output, D, l); +// gmp_fprintf(stderr, "Calculated base=%Zd, exp=%Zd, C=%Zd, D=%Zd, output=%Zd\n",base,n,C,D,output); + mpz_clears(orig_base,C,D,l,m,NULL); + return 1; +} +int eval_PhiL (mpz_t *params, mpz_t n) +{ + mpz_t aur; + int err1,err2; + mpz_init(aur); + err1=aurif(aur,params[0],n,-1); + err2=eval_Phi(params,n); // n now holds Phi(params[0],n) + mpz_gcd(n,n,aur); + mpz_clear(aur); + return err1*err2; +} +int eval_PhiM (mpz_t *params, mpz_t n) +{ + mpz_t aur; + int err1,err2; + mpz_init(aur); + err1=aurif(aur,params[0],n,1); + err2=eval_Phi(params,n); // n now holds Phi(params[0],n) + mpz_gcd(n,n,aur); + mpz_clear(aur); + return err1*err2; +} + +int eval_gcd (mpz_t *params, mpz_t n) +{ + mpz_gcd(n, n, params[0]); + return 1; +} + +int eval_U (mpz_t *params, mpz_t n) +/* params[0]=P, params[1]=Q */ +{ + unsigned long N; + mpz_t U1,U0,org_n,D,T; /* At each step U1 holds U(k), and U0 holds U(k-1) */ + long k,l; + + if (mpz_cmp_si (n, 0) < 0) + return 0; + if (mpz_cmp_ui (n, 1) == 0) + { + mpz_set_ui (n, 1); + return 1; + } + if (mpz_cmp_ui (n, 0) == 0) + { + mpz_set_ui (n, 0); + return 1; + } + if (mpz_fits_ulong_p (n) == 0) + return 0; + + N = mpz_get_ui (n); + if (mpz_cmp_ui (params[0], 0) == 0) + { + if( N%2==0 ) + { + mpz_set_ui (n, 0); + } + else + { + mpz_neg (params[1], params[1]); + mpz_pow_ui (n, params[1], (N-1)/2); + mpz_neg (params[1], params[1]); + } + return 1; + } + + + mpz_init_set (org_n, n); + mpz_init_set_ui (U1, 
1); + mpz_init_set_ui (U0, 0); + mpz_init (D); + mpz_init (T); + mpz_mul (D, params[0], params[0]); + mpz_submul_ui (D, params[1], 4); + k=1; + + for(l=mpz_sizeinbase(org_n,2)-2;l>=0;l--) + { + mpz_mul (U0, U0, U0); + mpz_mul (U1, U1, U1); + mpz_mul (U0, U0, params[1]); + mpz_sub (U0, U1, U0); // U(2k-1)=U(k)^2-QU(k-1)^2 + mpz_pow_ui (T, params[1], k); + mpz_mul (U1, U1, D); + mpz_addmul_ui (U1, T, 2); + mpz_addmul (U1, params[1], U0); // U(2k+1)=DU(k)^2+2Q^k+QU(2k-1) + if (mpz_tstbit (org_n, l) ) + { + k=2*k+1; + mpz_mul (U0,U0,params[1]); // U0 is 2k, U1 is 2k+1 + mpz_add (U0,U1,U0); + mpz_divexact (U0,U0,params[0]); + } + else + { + k=2*k; + mpz_addmul (U1,U0,params[1]); // U0 is 2k-1, U1 is 2k + mpz_divexact (U1,U1,params[0]); + } + /* gmp_printf("%d %Zd %Zd\n",k,U0,U1); */ + } + mpz_set(n, U1); + + mpz_clear(U0); + mpz_clear(U1); + mpz_clear(org_n); + mpz_clear(D); + mpz_clear(T); + + return 1; +} + +int eval_primU (mpz_t* params, mpz_t n) +{ + int factors[200]; + unsigned dwFactors=0, dw; + unsigned long N; + unsigned long p; + mpz_t D, T; + + if (mpz_cmp_ui (n, 0) <= 0) + return 0; + if (mpz_cmp_ui (n, 1) == 0) + { + mpz_set_ui (n, 1); + return 1; + } + + /* Ignore the special cases where P^2=0,Q or 4Q*/ + if (mpz_cmp_ui (params[0], 0) == 0) + { + return 0; + } + mpz_init(D); + mpz_mul(D, params[0], params[0]); + if (mpz_cmp (D, params[1]) == 0) + { + return 0; + } + mpz_submul_ui(D, params[1], 4); + if (mpz_cmp_ui (D, 0) == 0) + { + return 0; + } + + + if (mpz_fits_ulong_p (n) == 0) + return 0; + + N = mpz_get_ui (n); + + /* Obtain the factors of N */ + for (p = 2; p <= N; p++) + { + if (N % p == 0) + { + /* Add the factor one time */ + factors[dwFactors++] = p; + /* but be sure to totally remove it */ + do { N /= p; } while (N % p == 0); + } + } + + + N = mpz_get_ui (n); + + mpz_set_ui (n, 1); + mpz_set_ui (D, 1); + mpz_init (T); + + for(dw=0;(dw<(1U<gw_c = 0; } +/* function to be called between two calls of ecm_factor, it the same + ecm_params q is reused 
*/ +void +ecm_reset (ecm_params q) +{ + mpz_set_ui (q->sigma, 0); + q->B1done = ECM_DEFAULT_B1_DONE; + mpz_set_ui (q->x, 0); +} + void ecm_clear (ecm_params q) { @@ -121,6 +131,7 @@ } else if (mpz_cmp_ui (n, 1) == 0) { + /* we consider n=1 is fully factored (thus in step 1) */ mpz_set_ui (f, 1); return ECM_FACTOR_FOUND_STEP1; } @@ -144,7 +155,7 @@ if (p->gpu == 0) { #endif - res = ecm (f, p->x, p->y, &(p->param), p->sigma, n, p->go, + res = ecm (f, p->x, p->y, p->param, p->sigma, n, p->go, &(p->B1done), B1, p->B2min, p->B2, p->k, p->S, p->verbose, p->repr, p->nobase2step2, p->use_ntt, @@ -157,13 +168,13 @@ } else { - res = gpu_ecm (f, p->x, &(p->param), p->sigma, n, p->go, + res = gpu_ecm (f, p->x, p->param, p->sigma, n, p->go, &(p->B1done), B1, p->B2min, p->B2, p->k, p->S, p->verbose, p->repr, p->nobase2step2, p->use_ntt, p->sigma_is_A, p->os, p->es, p->chkfilename, p->TreeFilename, p->maxmem, p->stop_asap, p->batch_s, &(p->batch_last_B1_used), - p->gpu_device, &(p->gpu_device_init), + p->gpu_cgbn, p->gpu_device, &(p->gpu_device_init), &(p->gpu_number_of_curves)); } #endif diff -Nru gmp-ecm-7.0.4+ds/Fgw.c gmp-ecm-7.0.5+ds/Fgw.c --- gmp-ecm-7.0.4+ds/Fgw.c 2012-03-16 10:08:26.000000000 +0000 +++ gmp-ecm-7.0.5+ds/Fgw.c 2022-06-06 14:16:49.000000000 +0000 @@ -50,14 +50,6 @@ return; } -static int -sgn (const int i) -{ - if (i == 0) - return 0; - return i > 0 ? 1 : -1; -} - /* With the following 2 functions, we try to find a representation of an input number in the form of z = k*b^n+c. If such a representation was found, set the the appropriate values and return 1. 
Otherwise, set b to @@ -67,8 +59,6 @@ int kbnc_z (double *k, unsigned long *b, unsigned long *n, signed long *c, mpz_t z) { - int i = 0; - int j = 0; int exp = 1; int check_it_out = 0; int ret = 0; @@ -83,7 +73,6 @@ mpz_t base; mpz_t base_min; mpz_t base_max; - unsigned long test_k_ui = 0; /* this puts a bound on how large our C value can be */ int max_diff = 8388607; @@ -229,7 +218,7 @@ kbnc_str (double *k, unsigned long *b, unsigned long *n, signed long *c, char *z, mpz_t num) { - int i = 0; + unsigned long i = 0; int total = 0; char strk[11]; char strb[11]; @@ -313,7 +302,10 @@ mpz_sub_ui (tmp, tmp, (*c * -1)); if (mpz_divisible_p (tmp, num)) - return 1; + { + mpz_clear(tmp); + return 1; + } } /* set b to zero so users have a second way to know we didn't find k,b,n,c */ @@ -323,14 +315,15 @@ return 0; } -/* this method doesn't care if v is 32 or 64 bits... */ +/* return ceil(log(v)/log(2)) */ unsigned long gw_log_2(unsigned long v) { - unsigned long r = 0; /* r will be lg(v) */ + unsigned long r = 0; - while (v >>= 1) + while (v > 1) { r++; + v = (v + 1) / 2; } return r; @@ -394,7 +387,7 @@ B1, &gw_B1done, PTR(gw_A), ABSIZ(gw_A), PTR(gw_x), &siz_x, PTR(gw_z), &siz_z, NULL, 0); #endif - + /* Test that not more was written to gw_x and gw_z than we had space for */ ASSERT_ALWAYS (siz_x <= (unsigned long) ALLOC(gw_x)); ASSERT_ALWAYS (siz_z <= (unsigned long) ALLOC(gw_z)); diff -Nru gmp-ecm-7.0.4+ds/generic/params00.h gmp-ecm-7.0.5+ds/generic/params00.h --- gmp-ecm-7.0.4+ds/generic/params00.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/generic/params00.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,53 @@ +#ifndef MPZMOD_THRESHOLD +#define MPZMOD_THRESHOLD 170 +#endif + +#ifndef REDC_THRESHOLD +#define REDC_THRESHOLD 294 +#endif + +#ifndef MPN_MUL_LO_THRESHOLD_TABLE +#define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 1, 7, 8, 1, 1, 8, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 16, 1, 1, 16, 16, 1, 1, 16, 1} +#endif + +#ifndef NTT_GFP_TWIDDLE_DIF_BREAKOVER 
+#define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 +#endif + +#ifndef NTT_GFP_TWIDDLE_DIT_BREAKOVER +#define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 +#endif + +#ifndef MUL_NTT_THRESHOLD +#define MUL_NTT_THRESHOLD 1024 +#endif + +#ifndef PREREVERTDIVISION_NTT_THRESHOLD +#define PREREVERTDIVISION_NTT_THRESHOLD 64 +#endif + +#ifndef POLYINVERT_NTT_THRESHOLD +#define POLYINVERT_NTT_THRESHOLD 512 +#endif + +#ifndef POLYEVALT_NTT_THRESHOLD +#define POLYEVALT_NTT_THRESHOLD 512 +#endif + +#ifndef MPZSPV_NORMALISE_STRIDE +#define MPZSPV_NORMALISE_STRIDE 512 +#endif + +/* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ +#ifndef TUNE_MULREDC_TABLE +#define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +#endif + +/* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ +#ifndef TUNE_SQRREDC_TABLE +#define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +#endif + +#ifndef LIST_MUL_TABLE +#define LIST_MUL_TABLE {0,0,0,0,0,0,0,0,0,0,3,3,3,3,3,3} +#endif diff -Nru gmp-ecm-7.0.4+ds/generic/params11.h gmp-ecm-7.0.5+ds/generic/params11.h --- gmp-ecm-7.0.4+ds/generic/params11.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/generic/params11.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,53 @@ +#ifndef MPZMOD_THRESHOLD +#define MPZMOD_THRESHOLD 170 +#endif + +#ifndef REDC_THRESHOLD +#define REDC_THRESHOLD 294 +#endif + +#ifndef MPN_MUL_LO_THRESHOLD_TABLE +#define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 1, 7, 8, 1, 1, 8, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 16, 1, 1, 16, 16, 1, 1, 16, 1} +#endif + +#ifndef NTT_GFP_TWIDDLE_DIF_BREAKOVER +#define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 +#endif + +#ifndef NTT_GFP_TWIDDLE_DIT_BREAKOVER +#define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 +#endif + +#ifndef MUL_NTT_THRESHOLD +#define MUL_NTT_THRESHOLD 1024 +#endif + +#ifndef PREREVERTDIVISION_NTT_THRESHOLD +#define PREREVERTDIVISION_NTT_THRESHOLD 64 +#endif + +#ifndef POLYINVERT_NTT_THRESHOLD +#define POLYINVERT_NTT_THRESHOLD 512 +#endif + +#ifndef POLYEVALT_NTT_THRESHOLD 
+#define POLYEVALT_NTT_THRESHOLD 512 +#endif + +#ifndef MPZSPV_NORMALISE_STRIDE +#define MPZSPV_NORMALISE_STRIDE 512 +#endif + +/* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ +#ifndef TUNE_MULREDC_TABLE +#define TUNE_MULREDC_TABLE {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1} +#endif + +/* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ +#ifndef TUNE_SQRREDC_TABLE +#define TUNE_SQRREDC_TABLE {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1} +#endif + +#ifndef LIST_MUL_TABLE +#define LIST_MUL_TABLE {0,0,0,0,0,0,0,0,0,0,3,3,3,3,3,3} +#endif diff -Nru gmp-ecm-7.0.4+ds/generic/params22.h gmp-ecm-7.0.5+ds/generic/params22.h --- gmp-ecm-7.0.4+ds/generic/params22.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/generic/params22.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,53 @@ +#ifndef MPZMOD_THRESHOLD +#define MPZMOD_THRESHOLD 170 +#endif + +#ifndef REDC_THRESHOLD +#define REDC_THRESHOLD 294 +#endif + +#ifndef MPN_MUL_LO_THRESHOLD_TABLE +#define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 1, 7, 8, 1, 1, 8, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 16, 1, 1, 16, 16, 1, 1, 16, 1} +#endif + +#ifndef NTT_GFP_TWIDDLE_DIF_BREAKOVER +#define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 +#endif + +#ifndef NTT_GFP_TWIDDLE_DIT_BREAKOVER +#define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 +#endif + +#ifndef MUL_NTT_THRESHOLD +#define MUL_NTT_THRESHOLD 1024 +#endif + +#ifndef PREREVERTDIVISION_NTT_THRESHOLD +#define PREREVERTDIVISION_NTT_THRESHOLD 64 +#endif + +#ifndef POLYINVERT_NTT_THRESHOLD +#define POLYINVERT_NTT_THRESHOLD 512 +#endif + +#ifndef POLYEVALT_NTT_THRESHOLD +#define POLYEVALT_NTT_THRESHOLD 512 +#endif + +#ifndef MPZSPV_NORMALISE_STRIDE +#define MPZSPV_NORMALISE_STRIDE 512 +#endif + +/* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ +#ifndef TUNE_MULREDC_TABLE +#define TUNE_MULREDC_TABLE {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} +#endif + +/* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ +#ifndef TUNE_SQRREDC_TABLE +#define TUNE_SQRREDC_TABLE 
{2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} +#endif + +#ifndef LIST_MUL_TABLE +#define LIST_MUL_TABLE {0,0,0,0,0,0,0,0,0,0,3,3,3,3,3,3} +#endif diff -Nru gmp-ecm-7.0.4+ds/generic/params33.h gmp-ecm-7.0.5+ds/generic/params33.h --- gmp-ecm-7.0.4+ds/generic/params33.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/generic/params33.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,53 @@ +#ifndef MPZMOD_THRESHOLD +#define MPZMOD_THRESHOLD 170 +#endif + +#ifndef REDC_THRESHOLD +#define REDC_THRESHOLD 294 +#endif + +#ifndef MPN_MUL_LO_THRESHOLD_TABLE +#define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 1, 7, 8, 1, 1, 8, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 16, 1, 1, 16, 16, 1, 1, 16, 1} +#endif + +#ifndef NTT_GFP_TWIDDLE_DIF_BREAKOVER +#define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 +#endif + +#ifndef NTT_GFP_TWIDDLE_DIT_BREAKOVER +#define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 +#endif + +#ifndef MUL_NTT_THRESHOLD +#define MUL_NTT_THRESHOLD 1024 +#endif + +#ifndef PREREVERTDIVISION_NTT_THRESHOLD +#define PREREVERTDIVISION_NTT_THRESHOLD 64 +#endif + +#ifndef POLYINVERT_NTT_THRESHOLD +#define POLYINVERT_NTT_THRESHOLD 512 +#endif + +#ifndef POLYEVALT_NTT_THRESHOLD +#define POLYEVALT_NTT_THRESHOLD 512 +#endif + +#ifndef MPZSPV_NORMALISE_STRIDE +#define MPZSPV_NORMALISE_STRIDE 512 +#endif + +/* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ +#ifndef TUNE_MULREDC_TABLE +#define TUNE_MULREDC_TABLE {3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3} +#endif + +/* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ +#ifndef TUNE_SQRREDC_TABLE +#define TUNE_SQRREDC_TABLE {3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3} +#endif + +#ifndef LIST_MUL_TABLE +#define LIST_MUL_TABLE {0,0,0,0,0,0,0,0,0,0,3,3,3,3,3,3} +#endif diff -Nru gmp-ecm-7.0.4+ds/gpu_throughput_test.sh gmp-ecm-7.0.5+ds/gpu_throughput_test.sh --- gmp-ecm-7.0.4+ds/gpu_throughput_test.sh 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/gpu_throughput_test.sh 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,29 @@ 
+#!/usr/bin/bash + +ECM=${1:-./ecm} +DEFAULT_CURVES=$(echo "2^31-1" | ./ecm -gpu 10 | grep -oP '[0-9]*(?= parallel curves)') +C=${2:-$DEFAULT_CURVES} +B1=128000 + +if [[ "$C" -ne "$DEFAULT_CURVES" ]]; then + echo "Changed DEFAULT_CURVES from $DEFAULT_CURVES to $C" +fi + +filtered() { + echo "$1" | $ECM -v $2 $B1 0 2>&1 | grep -P "CGBN|Step" +} + +for number in "(2^269-1)/13822297" "(2^499-1)/20959" "2^997-1" "(2^1877-1)/15017"; do + echo -e "\n\nTESTING $number B1=$B1" + filtered "$number" + filtered "$number" "-gpu" + echo + + curve_test="$((C / 4)) $((C / 2)) $C $((C * 2)) $((C * 4)) $((C * 8))" + for curves in $curve_test; do + filtered "$number" "-cgbn -gpucurves $curves" + echo + done + + B1=$((B1 / 2)) +done diff -Nru gmp-ecm-7.0.4+ds/hecm/ariKS.c gmp-ecm-7.0.5+ds/hecm/ariKS.c --- gmp-ecm-7.0.4+ds/hecm/ariKS.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/ariKS.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,692 @@ +#include "ariKS.h" + +void ksPoint_init(ksPoint p,mpmod_t n) { + mpres_init (p->X,n); + mpres_init (p->Y,n); + mpres_init (p->Z,n); + mpres_init (p->T,n); +} + +void ksPoint_clear(ksPoint p,mpmod_t n) { + mpres_clear (p->X,n); + mpres_clear (p->Y,n); + mpres_clear (p->Z,n); + mpres_clear (p->T,n); +} + + + +void ksCstPourMul_init (ksCstPourMul cMul,mpmod_t n) { + mpres_init (cMul->invZ, n); + mpres_init (cMul->invT, n); + mpres_init (cMul->x0p, n); + mpres_init (cMul->t0p, n); + mpres_init (cMul->Y0, n); + mpres_init (cMul->Z0, n); +} + +void ksCstPourMul_clear (ksCstPourMul cMul,mpmod_t n) { + mpres_clear (cMul->invZ, n); + mpres_clear (cMul->invT, n); + mpres_clear (cMul->x0p, n); + mpres_clear (cMul->t0p, n); + mpres_clear (cMul->Y0, n); + mpres_clear (cMul->Z0, n); +} + + + + + + + +/* + Multiply the point P on the Kummer surface by k and put the result in P + The point P is such that X=-Y + The zero of the Kummer surface is [1,be,ga,1] since al=de=1 +*/ +void mulKS(ksPoint P, + ksCstPourMul cMul, + mpres_t be,mpres_t ga, + 
mpz_t k, + mpmod_t n) { + + int p; + // printf("warning: p est un int et on y met un size_t\n"); + ksPoint Pi; + mpres_t u,v; + mpres_init (u,n); + mpres_init (v,n); + + ksPoint_init (Pi,n); + + + + if ( !mpz_sgn(k)) { // k=0 + mpres_set_ui (P->X,1,n); + mpres_set (P->Y,be,n); + mpres_set (P->Z,ga,n); + mpres_set_ui (P->T,1,n); + } + + else if ( mpz_sgn(k) < 0 ) { // k < 0 + mpz_neg (k,k); // k=-k + } + + else if ( !mpz_cmp_ui(k,2) ) { // k=2 + doubleKS2(P,cMul,u,v,n); + } + + + else if ( mpz_cmp_ui(k,2) > 0 ) { // k>2 + doubleKS(Pi,P,cMul,u,v,n); + + for (p=(mpz_sizeinbase(k,2)-2);p>=0;p--) { + if ( mpz_tstbit (k,p) ) { + loopMulKS(Pi,P,cMul,u,v,n); + } + else { + loopMulKS(P,Pi,cMul,u,v,n); + } + // + // We could save a few operations on the last bit of k + // We have computed k*P and (k+1)*P. + } + } + + ksPoint_clear (Pi,n); + mpres_clear (u,n); + mpres_clear (v,n); +} + + + + + + +/* + Double a point P=[X,Y,Z,T] on the Kummer surface and put the result in P +*/ +void doubleKS2(ksPoint P, + ksCstPourMul cMul, + mpres_t u,mpres_t v, + mpmod_t n) { + + ksPoint Q; + ksPoint_init (Q,n); + + /* magma + Xi := x0p*(X + Y + Z + T)^2; + Yi := (X + Y - Z - T)^2; + Zi := z0p*(X - Y + Z - T)^2; // z0p=-1 + Ti := t0p*(X - Y - Z + T)^2; + + x2 := (Xi + Yi + Zi + Ti)^2; + y2 := Y0*(Xi + Yi - Zi - Ti)^2; + z2 := Z0*(Xi - Yi + Zi - Ti)^2; + t2 := T0*(Xi - Yi - Zi + Ti)^2; // T0=1 + */ + + hadamard (Q,P,u,v,n); + // Pi->X = X + Y + Z + T + // Pi->Y = X + Y - Z - T + // Pi->Z = X - Y + Z - T + // Pi->T = X - Y - Z + T + + + + mpres_mul (Q->X ,Q->X ,Q->X ,n); + mpres_mul (Q->Y ,Q->Y ,Q->Y ,n); // Yi + mpres_mul (Q->Z ,Q->Z ,Q->Z ,n); + mpres_mul (Q->T ,Q->T ,Q->T ,n); + + mpres_neg (Q->Z ,Q->Z ,n); // Zi + // We could change the sign of Zi later + mpres_mul (Q->X ,Q->X ,cMul->x0p ,n); // Xi + mpres_mul (Q->T ,Q->T ,cMul->t0p ,n); // Yi + + + hadamard (P,Q,u,v,n); + // P->X = Xi + Yi + Zi + Ti + // P->Y = Xi + Yi - Zi - Ti + // P->Z = Xi - Yi + Zi - Ti + // P->T = Xi - Yi - Zi + Ti 
+ + + + mpres_mul (P->X ,P->X ,P->X ,n); // x2 + mpres_mul (P->Y ,P->Y ,P->Y ,n); + mpres_mul (P->Z ,P->Z ,P->Z ,n); + mpres_mul (P->T ,P->T ,P->T ,n); // t2 + + mpres_mul (P->Y ,P->Y ,cMul->Y0 ,n); // y2 + mpres_mul (P->Z ,P->Z ,cMul->Z0 ,n); // z2 + + + + ksPoint_clear (Q,n); + + +} + + + + + + +/* + Double a point P the Kummer surface and put the result in P2 +*/ +void doubleKS(ksPoint P2,const ksPoint P, + ksCstPourMul cMul, + mpres_t u,mpres_t v, + mpmod_t n) { + + ksPoint Pi; + ksPoint_init (Pi,n); + + + + /* magma + Xi := x0p*(X + Y + Z + T)^2; + Yi := (X + Y - Z - T)^2; + Zi := z0p*(X - Y + Z - T)^2; // z0p=-1 + Ti := t0p*(X - Y - Z + T)^2; + + X2 := (Xi + Yi + Zi + Ti)^2; + Y2 := Y0*(Xi + Yi - Zi - Ti)^2; + Z2 := Z0*(Xi - Yi + Zi - Ti)^2; + T2 := T0*(Xi - Yi - Zi + Ti)^2; // T0=1 + */ + + + + hadamard (Pi,P,u,v,n); + // Pi->x = (X + Y + Z + T) + // Pi->y = (X + Y - Z - T) + // Pi->z = (X - Y + Z - T) + // Pi->t = (X - Y - Z + T) + + + + mpres_mul (Pi->X ,Pi->X ,Pi->X ,n); + mpres_mul (Pi->Y ,Pi->Y ,Pi->Y ,n); // Yi + mpres_mul (Pi->Z ,Pi->Z ,Pi->Z ,n); + mpres_mul (Pi->T ,Pi->T ,Pi->T ,n); + mpres_neg (Pi->Z ,Pi->Z ,n); // Zi + // We could change the sign of Zi later + mpres_mul (Pi->X ,Pi->X ,cMul->x0p ,n); // Xi + mpres_mul (Pi->T ,Pi->T ,cMul->t0p ,n); // Ti + + + hadamard (P2,Pi,u,v,n); + // P2->X = Xi + Yi + Zi + Ti + // P2->Y = Xi + Yi - Zi - Ti + // P2->Z = Xi - Yi + Zi - Ti + // P2->T = Xi - Yi - Zi + Ti + + + + mpres_mul (P2->X ,P2->X ,P2->X ,n); // X2 + mpres_mul (P2->Y ,P2->Y ,P2->Y ,n); + mpres_mul (P2->Z ,P2->Z ,P2->Z ,n); + mpres_mul (P2->T ,P2->T ,P2->T ,n); // T2 + + mpres_mul (P2->Y ,P2->Y ,cMul->Y0 ,n); // Y2 + mpres_mul (P2->Z ,P2->Z ,cMul->Z0 ,n); // Z2 + + + + ksPoint_clear (Pi,n); +} + + + + + + +/* + One step during the multiplication loop. 
+ We have Pm=[X,Y,Z,T] and Pp=[x,y,z,t] with Pm-Pp equal the initial point + We want Pm <- 2*Pm=[X,Y,Z,T] and Pp <- Pm+Pp=[x,y,z,t] +*/ +void loopMulKS(ksPoint Pm,ksPoint Pp, + ksCstPourMul cMul, + mpres_t u,mpres_t v, + mpmod_t n) { + + ksPoint Qm,Qp; + mpres_t U,V; + + ksPoint_init (Qm,n); // Let Qm = [X2,Y2,Z2,T2] + ksPoint_init (Qp,n); // Let Qp = [x2,y2,z2,t2] + + mpres_init (U, n); + mpres_init (V, n); + + + + hadamard (Qm,Pm,u,v,n); + // Qm->X = X2 = X + Y + Z + T + // Qm->Y = Y2 = X + Y - Z - T + // Qm->Z = Z2 = X - Y + Z - T + // Qm->T = T2 = X - Y - Z + T + + mpres_mul (U ,Qm->X ,cMul->x0p ,n); // U = x0p * (X + Y + Z + T) + mpres_mul (V ,Qm->T ,cMul->t0p ,n); // V = t0p * (X - Y - Z + T) + + + hadamard (Qp,Pp,u,v,n); + // Qp->X = x2 = x + y + z + t + // Qp->Y = y2 = x + y - z - t + // Qp->Z = z2 = x - y + z - t + // Qp->T = t2 = x - y - z + t + + + + // PseudoAdd + + mpres_mul (Qp->X ,Qp->X ,U ,n); // Qp->X = x0p * (X+Y+Z+T) * (x+y+z+t) + mpres_mul (Qp->Y ,Qp->Y ,Qm->Y ,n); // Qp->Y = (X+Y-Z-T) * (x+y-z-t) + mpres_mul (Qp->Z ,Qp->Z ,Qm->Z ,n); + mpres_neg (Qp->Z ,Qp->Z ,n); // Qp->Z = z0p * (X-Y+Z-T) * (x-y+z-t) + mpres_mul (Qp->T ,Qp->T ,V ,n); // Qp->T = t0p * (X-Y-Z+T) * (x-y-z+t) + + hadamard (Pp,Qp,u,v,n); + // Pp->X = x2 + y2 + z2 + t2 + // Pp->Y = x2 + y2 - z2 - t2 + // Pp->Z = x2 - y2 + z2 - t2 + // Pp->T = x2 - y2 - z2 + t2 + + mpres_mul (Pp->X ,Pp->X ,Pp->X ,n); // Pp->X = (x2+y2+z2+t2)^2 + mpres_mul (Pp->Y ,Pp->Y ,Pp->Y ,n); + mpres_neg (Pp->Y ,Pp->Y ,n); // Pp->Y = invY * (x2+y2-z2-t2)^2 + mpres_mul (Pp->Z ,Pp->Z ,Pp->Z ,n); + mpres_mul (Pp->Z ,Pp->Z ,cMul->invZ ,n); // Pp->Z = invZ * (x2-y2+z2-t2)^2 + mpres_mul (Pp->T ,Pp->T ,Pp->T ,n); + mpres_mul (Pp->T ,Pp->T ,cMul->invT ,n); // Pp->T = invT * (x2-y2-z2+t2)^2 + + + + // Double + + mpres_mul (Qm->X ,Qm->X ,U ,n); // Qm->X = x0p * (X + Y + Z + T)^2 + mpres_mul (Qm->Y ,Qm->Y ,Qm->Y ,n); // Qm->Y = (X + Y - Z - T)^2 + mpres_mul (Qm->Z ,Qm->Z ,Qm->Z ,n); + mpres_neg (Qm->Z ,Qm->Z ,n); // Qm->Z 
= z0p * (X - Y + Z - T)^2 + mpres_mul (Qm->T ,Qm->T ,V ,n); // Qm->T = t0p * (X - Y - Z + T)^2 + + hadamard (Pm,Qm,u,v,n); + // Pm->X = X2 + Y2 + Z2 + T2 + // Pm->Y = X2 + Y2 - Z2 - T2 + // Pm->Z = X2 - Y2 + Z2 - T2 + // Pm->T = X2 - Y2 - Z2 + T2 + + mpres_mul (Pm->X ,Pm->X ,Pm->X ,n); // Pm->X = (X2 + Y2 + Z2 + T2)^2 + mpres_mul (Pm->Y ,Pm->Y ,Pm->Y ,n); + mpres_mul (Pm->Y ,Pm->Y ,cMul->Y0 ,n); // Pm->Y = Y0 * (X2 + Y2 - Z2 - T2)^2 + mpres_mul (Pm->Z ,Pm->Z ,Pm->Z ,n); + mpres_mul (Pm->Z ,Pm->Z ,cMul->Z0 ,n); // Pm->Z = Z0 * (X2 - Y2 + Z2 - T2)^2 + mpres_mul (Pm->T ,Pm->T ,Pm->T ,n); // Pm->T = (X2 - Y2 - Z2 + T2)^2 + + + + ksPoint_clear (Qm,n); + ksPoint_clear (Qp,n); + + mpres_clear (U, n); + mpres_clear (V, n); + +} + + + + + + + + + + + + + + +// ************* Small parameters ***************** + + + + + + + + +/* + Double a point P=[X,Y,Z,T] on the Kummer surface and put the result in P2 + We use small parameters +*/ +void doubleKSsmallParam(ksPoint P2,const ksPoint P, + ksSmallConstPourMul cMul, + mpres_t u,mpres_t v, + mpmod_t n) { + + ksPoint Pi; + ksPoint_init (Pi,n); + + + + /* magma + Xi := x0p*(X + Y + Z + T)^2; + Yi := y0p*(X + Y - Z - T)^2; + Zi := z0p*(X - Y + Z - T)^2; + Ti := t0p*(X - Y - Z + T)^2; + + X2 := X0*(Xi + Yi + Zi + Ti)^2; + Y2 := Y0*(Xi + Yi - Zi - Ti)^2; + Z2 := Z0*(Xi - Yi + Zi - Ti)^2; + T2 := T0*(Xi - Yi - Zi + Ti)^2; + */ + + + + hadamard (Pi,P,u,v,n); + // Pi->x = (X + Y + Z + T) + // Pi->y = (X + Y - Z - T) + // Pi->z = (X - Y + Z - T) + // Pi->t = (X - Y - Z + T) + + mpres_mul (Pi->X ,Pi->X ,Pi->X ,n); + mpres_mul (Pi->Y ,Pi->Y ,Pi->Y ,n); + mpres_mul (Pi->Z ,Pi->Z ,Pi->Z ,n); + mpres_mul (Pi->T ,Pi->T ,Pi->T ,n); + + mpres_muldivbysomething_si (Pi->X ,Pi->X ,cMul->x0p ,n); + mpres_muldivbysomething_si (Pi->Y ,Pi->Y ,cMul->y0p ,n); + mpres_muldivbysomething_si (Pi->Z ,Pi->Z ,cMul->z0p ,n); + mpres_muldivbysomething_si (Pi->T ,Pi->T ,cMul->t0p ,n); + + + + hadamard (P2,Pi,u,v,n); + // P2->X = Xi + Yi + Zi + Ti + // P2->Y = Xi + 
Yi - Zi - Ti + // P2->Z = Xi - Yi + Zi - Ti + // P2->T = Xi - Yi - Zi + Ti + + mpres_mul (P2->X ,P2->X ,P2->X ,n); + mpres_mul (P2->Y ,P2->Y ,P2->Y ,n); + mpres_mul (P2->Z ,P2->Z ,P2->Z ,n); + mpres_mul (P2->T ,P2->T ,P2->T ,n); + + mpres_muldivbysomething_si (P2->X ,P2->X ,cMul->X0 ,n); + mpres_muldivbysomething_si (P2->Y ,P2->Y ,cMul->Y0 ,n); + mpres_muldivbysomething_si (P2->Z ,P2->Z ,cMul->Z0 ,n); + mpres_muldivbysomething_si (P2->T ,P2->T ,cMul->T0 ,n); + + + + ksPoint_clear (Pi,n); +} + + + + + + +/* + Double a point P=[X,Y,Z,T] on the Kummer surface and put the result in P + We use small parameters +*/ +void doubleKSsmallParam2(ksPoint P, + ksSmallConstPourMul cMul, + ksPoint Pi,mpres_t u,mpres_t v, + mpmod_t n) { + + /* magma + Xi := x0p*(X + Y + Z + T)^2; + Yi := y0p*(X + Y - Z - T)^2; + Zi := z0p*(X - Y + Z - T)^2; + Ti := t0p*(X - Y - Z + T)^2; + + X := X0*(Xi + Yi + Zi + Ti)^2; + Y := Y0*(Xi + Yi - Zi - Ti)^2; + Z := Z0*(Xi - Yi + Zi - Ti)^2; + T := T0*(Xi - Yi - Zi + Ti)^2; + */ + + + hadamard (Pi,P,u,v,n); + // Pi->x = (X + Y + Z + T) + // Pi->y = (X + Y - Z - T) + // Pi->z = (X - Y + Z - T) + // Pi->t = (X - Y - Z + T) + + mpres_mul (Pi->X ,Pi->X ,Pi->X ,n); + mpres_mul (Pi->Y ,Pi->Y ,Pi->Y ,n); + mpres_mul (Pi->Z ,Pi->Z ,Pi->Z ,n); + mpres_mul (Pi->T ,Pi->T ,Pi->T ,n); + + mpres_muldivbysomething_si (Pi->X ,Pi->X ,cMul->x0p ,n); + mpres_muldivbysomething_si (Pi->Y ,Pi->Y ,cMul->y0p ,n); + mpres_muldivbysomething_si (Pi->Z ,Pi->Z ,cMul->z0p ,n); + mpres_muldivbysomething_si (Pi->T ,Pi->T ,cMul->t0p ,n); + + + + hadamard (P,Pi,u,v,n); + // P->X = Xi + Yi + Zi + Ti + // P->Y = Xi + Yi - Zi - Ti + // P->Z = Xi - Yi + Zi - Ti + // P->T = Xi - Yi - Zi + Ti + + mpres_mul (P->X ,P->X ,P->X ,n); + mpres_mul (P->Y ,P->Y ,P->Y ,n); + mpres_mul (P->Z ,P->Z ,P->Z ,n); + mpres_mul (P->T ,P->T ,P->T ,n); + + mpres_muldivbysomething_si (P->X ,P->X ,cMul->X0 ,n); + mpres_muldivbysomething_si (P->Y ,P->Y ,cMul->Y0 ,n); + mpres_muldivbysomething_si (P->Z ,P->Z 
,cMul->Z0 ,n); + mpres_muldivbysomething_si (P->T ,P->T ,cMul->T0 ,n); + + +} + + + + +/* + Multiply a point P on the Kummer surface by k and put the result in P + We use small paramters + The zero on the Kummer surface is [1,be,ga,1] since al=de=1 +*/ +void mulKSsmallParam(ksPoint P, + ksSmallConstPourMul cMul, + mpres_t be,mpres_t ga, + mpz_t k, + mpmod_t n) { + + int p; + // printf("warning: p est un int et on y met un size_t\n"); + ksPoint Q,R; + mpres_t u,v; + + ksPoint_init (Q,n); + ksPoint_init (R,n); + mpres_init (u,n); + mpres_init (v,n); + + + if ( !mpz_sgn(k)) { // k=0 + mpres_set_ui (P->X,1,n); + mpres_set (P->Y,be,n); + mpres_set (P->Z,ga,n); + mpres_set_ui (P->T,1,n); + } + + else if ( mpz_sgn(k) < 0 ) { // k < 0 + mpz_neg (k,k); // k=-k + } + + else if ( !mpz_cmp_ui(k,2) ) { // k=2 + doubleKSsmallParam2 (P ,cMul ,R,u,v ,n); + } + + + else if ( mpz_cmp_ui(k,2) > 0 ) { // k>2 + doubleKSsmallParam (Q ,P ,cMul ,u,v ,n); + + for (p=(mpz_sizeinbase(k,2)-2);p>=0;p--) { + if ( mpz_tstbit (k,p) ) { + // P <- P+Q // Q <- 2*Q + loopMulKSsmallParam (P,Q,cMul,R,u,v,n); + + } + else { + // Q <- Q+P // P <- 2*P + loopMulKSsmallParam (Q,P,cMul,R,u,v,n); + + + } + // + // We could save a few operations on the last bit of k + // We have computed k*P and (k+1)*P. 
+ } + } + + ksPoint_clear (Q,n); + ksPoint_clear (R,n); + mpres_clear (u,n); + mpres_clear (v,n); +} + + + + + + +/* + The loop for multiplication on the Kummer surface with small constants + Imput: two points P,Q + Output P <- P+Q and Q <- 2*Q +*/ +void loopMulKSsmallParam (ksPoint P,ksPoint Q,ksSmallConstPourMul cMul, + ksPoint R,mpres_t u,mpres_t v,mpmod_t n) { + + + // hadamard (R,Q,u,v,n); + mpres_add (u ,Q->X ,Q->Y ,n); + mpres_add (v ,Q->Z ,Q->T ,n); + mpres_add (R->X ,u ,v ,n); + mpres_sub (R->Y ,u ,v ,n); + mpres_sub (v ,Q->X ,Q->Y ,n); + mpres_sub (u ,Q->Z ,Q->T ,n); + mpres_add (R->Z ,v ,u ,n); + mpres_sub (R->T ,v ,u ,n); + // hadamard (Q,P,u,v,n); + mpres_add (u ,P->X ,P->Y ,n); + mpres_add (v ,P->Z ,P->T ,n); + mpres_add (Q->X ,u ,v ,n); + mpres_sub (Q->Y ,u ,v ,n); + mpres_sub (v ,P->X ,P->Y ,n); + mpres_sub (u ,P->Z ,P->T ,n); + mpres_add (Q->Z ,v ,u ,n); + mpres_sub (Q->T ,v ,u ,n); + + mpres_mul (P->X ,Q->X ,R->X ,n); + mpres_mul (P->Y ,Q->Y ,R->Y ,n); + mpres_mul (P->Z ,Q->Z ,R->Z ,n); + mpres_mul (P->T ,Q->T ,R->T ,n); + mpres_muldivbysomething_si (P->X ,P->X ,cMul->x0p ,n); + mpres_muldivbysomething_si (P->Y ,P->Y ,cMul->y0p ,n); + mpres_muldivbysomething_si (P->Z ,P->Z ,cMul->z0p ,n); + mpres_muldivbysomething_si (P->T ,P->T ,cMul->t0p ,n); + + + // hadamard (Q,P,u,v,n); + mpres_add (u ,P->X ,P->Y ,n); + mpres_add (v ,P->Z ,P->T ,n); + mpres_add (Q->X ,u ,v ,n); + mpres_sub (Q->Y ,u ,v ,n); + mpres_sub (v ,P->X ,P->Y ,n); + mpres_sub (u ,P->Z ,P->T ,n); + mpres_add (Q->Z ,v ,u ,n); + mpres_sub (Q->T ,v ,u ,n); + + + + mpres_mul (P->X ,Q->X ,Q->X ,n); + mpres_mul (P->Y ,Q->Y ,Q->Y ,n); + mpres_mul (P->Z ,Q->Z ,Q->Z ,n); + mpres_mul (P->T ,Q->T ,Q->T ,n); + mpres_muldivbysomething_si (P->X ,P->X ,cMul->invX ,n); + mpres_muldivbysomething_si (P->Y ,P->Y ,cMul->invY ,n); + mpres_muldivbysomething_si (P->Z ,P->Z ,cMul->invZ ,n); + mpres_muldivbysomething_si (P->T ,P->T ,cMul->invT ,n); + + + mpres_mul (Q->X ,R->X ,R->X ,n); + mpres_mul (Q->Y ,R->Y 
,R->Y ,n); + mpres_mul (Q->Z ,R->Z ,R->Z ,n); + mpres_mul (Q->T ,R->T ,R->T ,n); + mpres_muldivbysomething_si (Q->X ,Q->X ,cMul->x0p ,n); + mpres_muldivbysomething_si (Q->Y ,Q->Y ,cMul->y0p ,n); + mpres_muldivbysomething_si (Q->Z ,Q->Z ,cMul->z0p ,n); + mpres_muldivbysomething_si (Q->T ,Q->T ,cMul->t0p ,n); + + // hadamard (R,Q,u,v,n); + mpres_add (u ,Q->X ,Q->Y ,n); + mpres_add (v ,Q->Z ,Q->T ,n); + mpres_add (R->X ,u ,v ,n); + mpres_sub (R->Y ,u ,v ,n); + mpres_sub (v ,Q->X ,Q->Y ,n); + mpres_sub (u ,Q->Z ,Q->T ,n); + mpres_add (R->Z ,v ,u ,n); + mpres_sub (R->T ,v ,u ,n); + + mpres_mul (Q->X ,R->X ,R->X ,n); + mpres_mul (Q->Y ,R->Y ,R->Y ,n); + mpres_mul (Q->Z ,R->Z ,R->Z ,n); + mpres_mul (Q->T ,R->T ,R->T ,n); + mpres_muldivbysomething_si (Q->X ,Q->X ,cMul->X0 ,n); + mpres_muldivbysomething_si (Q->Y ,Q->Y ,cMul->Y0 ,n); + mpres_muldivbysomething_si (Q->Z ,Q->Z ,cMul->Z0 ,n); + mpres_muldivbysomething_si (Q->T ,Q->T ,cMul->T0 ,n); + +} + + + + + + +// ***************** product of Hadamard ************* + + + + +/* + Compute the product of the Hadamard matrix with the vector P + Put the result in P + WARNING: don't do hadamard(P,P,...) 
+ */ +void hadamard (ksPoint Pi,const ksPoint P, + mpres_t u,mpres_t v, + mpmod_t n) { + + + mpres_add (u ,P->X ,P->Y ,n); // u = X + Y + mpres_add (v ,P->Z ,P->T ,n); // v = Z + T + + mpres_add (Pi->X ,u ,v ,n); // Pi->X = u + v = (X + Y + Z + T) + mpres_sub (Pi->Y ,u ,v ,n); // Pi->Y = u - v = (X + Y - Z - T) + + + mpres_sub (v ,P->X ,P->Y ,n); // v = X - Y + mpres_sub (u ,P->Z ,P->T ,n); // u = Z - T + + mpres_add (Pi->Z ,v ,u ,n); // Pi->Z = v + u = (X - Y + Z - T) + mpres_sub (Pi->T ,v ,u ,n); // Pi->T = v - u = (X - Y - Z + T) + + +} diff -Nru gmp-ecm-7.0.4+ds/hecm/ariKS.h gmp-ecm-7.0.5+ds/hecm/ariKS.h --- gmp-ecm-7.0.4+ds/hecm/ariKS.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/ariKS.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,78 @@ +#ifndef _ARI_KS_H +#define _ARI_KS_H + +#include "../ecm-impl.h" +#include "auxi.h" + + +struct ksCstPourMul_s { + mpres_t invZ; + mpres_t invT; + mpres_t x0p; + mpres_t t0p; + mpres_t Y0; + mpres_t Z0; +}; +typedef struct ksCstPourMul_s ksCstPourMul[1]; + +void ksCstPourMul_init (ksCstPourMul cMul,mpmod_t n); +void ksCstPourMul_clear (ksCstPourMul cMul,mpmod_t n); + + +struct ksSmallConstPourMul_s { + long invX; + long invY; + long invZ; + long invT; + long x0p; + long y0p; + long z0p; + long t0p; + long X0; + long Y0; + long Z0; + long T0; +}; +typedef struct ksSmallConstPourMul_s ksSmallConstPourMul[1]; + + +struct kspoint_s { + mpres_t X; + mpres_t Y; + mpres_t Z; + mpres_t T; +}; +typedef struct kspoint_s ksPoint[1]; + +void ksPoint_init(ksPoint p,mpmod_t n); +void ksPoint_clear(ksPoint p,mpmod_t n); + + + + +#include "generation.h" + + +void mulKS(ksPoint P,ksCstPourMul cMul,mpres_t be,mpres_t ga,mpz_t k,mpmod_t n); + +void doubleKS2(ksPoint P,ksCstPourMul cMul,mpres_t u,mpres_t v,mpmod_t n); + +void doubleKS(ksPoint P2,const ksPoint P,ksCstPourMul cMul,mpres_t u,mpres_t v,mpmod_t n); + +void loopMulKS(ksPoint Pm,ksPoint Pp,ksCstPourMul cMul,mpres_t u,mpres_t v,mpmod_t n); + + +void 
doubleKSsmallParam(ksPoint P2,const ksPoint P,ksSmallConstPourMul cMul,mpres_t u,mpres_t v,mpmod_t n); + +void doubleKSsmallParam2(ksPoint P,ksSmallConstPourMul cMul,ksPoint Pi,mpres_t u,mpres_t v,mpmod_t n); + +void mulKSsmallParam(ksPoint P,ksSmallConstPourMul cMul,mpres_t be,mpres_t ga,mpz_t k,mpmod_t n); + +void loopMulKSsmallParam (ksPoint P,ksPoint Q,ksSmallConstPourMul cMul,ksPoint R,mpres_t u,mpres_t v,mpmod_t n); + +void hadamard (ksPoint P2,const ksPoint P,mpres_t u,mpres_t v, mpmod_t n); + + + + +#endif diff -Nru gmp-ecm-7.0.4+ds/hecm/auxi.c gmp-ecm-7.0.5+ds/hecm/auxi.c --- gmp-ecm-7.0.4+ds/hecm/auxi.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/auxi.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,451 @@ +#include "auxi.h" +#include + + +/* + compute k=lcm(2,..,B1) + WARNING: slow version (no product tree) + NOT USED +*/ +void calculk (mpz_t k,double B1) { + double p,r; + mpz_t q; + + mpz_set_ui (k,1); + mpz_init (q); + + for ( p = 2.0; p <= B1; p = getprime () ) { + mpz_set_d (q ,p); + for (r = p; r <= B1; r *= p) { + mpz_mul (k ,k ,q); + } + } + + mpz_clear (q); + +} + + +/* + Compute k=lcm(2,..,B1) with the use of a product tree + This is what is used +*/ +void prodTreeCalculk (mpz_t k,double B1) { + int stop = 0; + int n=0; + double r; + mpz_t t; + + mpz_init (t); + + mpz_set_ui (k,1); + for (r = 2.0; r <= B1; r *= 2.0) { + mpz_mul_ui (k ,k ,2); + } + + while (stop == 0) { + prodTreeCalculkInter (t,B1,n,&stop); + mpz_mul (k ,k ,t); + n++; + } + + mpz_clear (t); +} + + +void prodTreeCalculkInter (mpz_t Pr,double B1, int n, int* pstop) { + double p,r; + mpz_t t1,q; + + mpz_init (t1); + mpz_init (q); + + if (n==0) { + p = getprime(); + if ( p>B1 ) { + mpz_set_ui (Pr,1); + *pstop = 1; + } + else { + mpz_set_ui (Pr,1); + mpz_set_ui (q,p); + for (r = p; r <= B1; r *= p) { + mpz_mul (Pr,Pr,q); + } + } + } + else { // n>0 + if (*pstop == 1) { + mpz_set_ui (Pr,1); + } + else { + prodTreeCalculkInter (Pr,B1,n-1,pstop); + prodTreeCalculkInter 
(t1,B1,n-1,pstop); + mpz_mul (Pr,Pr,t1); + } + } + + mpz_clear(q); + mpz_clear(t1); + +} + + + + + + +/* + compute k="lcm(2,..,B1)" with one power more for each prime + Use a product tree. + NOT USED +*/ + +void prodTreeCalculkPlus (mpz_t k,double B1) { + int stop = 0; + int n=0; + double r; + mpz_t t; + + mpz_init (t); + + for (r = 2.0; r <= B1; r *= 2.0) { + } + mpz_set_d (k,r); + + while (stop == 0) { + prodTreeCalculkInterPlus (t,B1,n,&stop); + mpz_mul (k ,k ,t); + n++; + } + + mpz_clear (t); +} + + + +// NOT USED +void prodTreeCalculkInterPlus (mpz_t Pr,double B1, int n, int* pstop) { + double p,r; + mpz_t t1; + + mpz_init (t1); + + if (n==0) { + p = getprime(); + if ( p>B1 ) { + mpz_set_ui (Pr,1); + *pstop = 1; + } + else { + for (r = p; r <= B1; r *= p) { + } + mpz_set_d (Pr ,r); + } + } + else { // n>0 + if (*pstop == 1) { + mpz_set_ui (Pr,1); + } + else { + prodTreeCalculkInter (Pr,B1,n-1,pstop); + prodTreeCalculkInter (t1,B1,n-1,pstop); + mpz_mul (Pr,Pr,t1); + } + } + + + mpz_clear(t1); + +} + + + + + + +// ***************************************************************************** + + + + + + + + +void mpalgpol_init (mpalgpol_t pol, mpmod_t n) { + int i; + for (i=0; i< DEGREE_ALGEBRA; i++) { + mpres_init (pol->coeff[i],n); + } + mpres_init (pol->t1,n); + mpalgres_init (pol->aTemp1,n); + mpalgres_init (pol->aTemp2,n); +} + +void mpalgpol_clear (mpalgpol_t pol, mpmod_t n) { + int i; + for (i=0; i< DEGREE_ALGEBRA; i++) { + mpres_clear (pol->coeff[i],n); + } + mpres_clear (pol->t1,n); + mpalgres_clear (pol->aTemp1,n); + mpalgres_clear (pol->aTemp2,n); +} + +// give the gcd of aP[i] with n +void mpalgres_gcd (mpz_t aF[DEGREE_ALGEBRA], mpalgres_t aP, mpmod_t n) { + int i; + + for (i=0; i< DEGREE_ALGEBRA; i++) { + mpres_gcd (aF[i],aP[i],n); + } +} + + + + +void mpalgres_init (mpalgres_t aP, mpmod_t n) { + int i; + + for (i=0; i< DEGREE_ALGEBRA; i++) { + mpres_init (aP[i],n); + } +} + +void mpalgres_clear (mpalgres_t aP, mpmod_t n) { + int i; + + for (i=0; i< 
DEGREE_ALGEBRA; i++) { + mpres_clear (aP[i],n); + } +} + +/* aR <- aP */ +void mpalgres_set (mpalgres_t aR, mpalgres_t aP,mpmod_t n) { + int i; + for (i=0;i=0 && mpres_is_zero(aP[i],n)) { + i--; + } + return i; +} + + +/* aR <- -aP */ +void mpalgres_neg (mpalgres_t aR, mpalgres_t aP, mpalgpol_t pol, mpmod_t n) { + int i; + for (i=0; it1,aP[DEGREE_ALGEBRA-1]); + for (i=DEGREE_ALGEBRA-1; i > 0; i--) { + mpres_mul (aR[i] ,pol->t1, pol->coeff[i] ,n); + mpres_sub (aR[i] ,aP[i-1], aR[i] ,n); + } + mpres_mul (aR[0] ,pol->t1 ,pol->coeff[0] ,n); + mpres_neg (aR[0] ,aR[0] ,n); + +} + +/* aR <- aP*aQ */ +void mpalgres_mul (mpalgres_t aR, mpalgres_t aP, mpalgres_t aQ, + mpalgpol_t pol, mpmod_t n) { + int i; + + if ( (DEGREE_ALGEBRA==2) && mpres_is_zero(pol->coeff[1],n) ) { + mpres_mul (pol->t1 ,aP[1] ,aQ[1] ,n); + mpres_mul (pol->t1 ,pol->t1 ,pol->coeff[0] ,n); + mpres_mul (pol->aTemp1[0] ,aP[0] ,aQ[0] ,n); + mpres_sub (pol->aTemp1[0] ,pol->aTemp1[0] ,pol->t1 ,n); + + mpres_mul (pol->aTemp1[1] ,aP[0] ,aQ[1] ,n); + mpres_mul (pol->t1 ,aP[1] ,aQ[0] ,n); + mpres_add (pol->aTemp1[1] ,pol->aTemp1[1] ,pol->t1 ,n); + + mpalgres_set (aR ,pol->aTemp1 ,n); + } + else { + mpalgres_set (pol->aTemp1 ,aP ,n); + mpalgres_mul_mpres (aR ,aP ,aQ[0] ,pol,n); + for (i=1;iaTemp1 ,pol->aTemp1 ,pol ,n); + mpalgres_mul_mpres (pol->aTemp2 ,pol->aTemp1 ,aQ[i] ,pol ,n); + mpalgres_add (aR ,aR ,pol->aTemp2 ,pol ,n); + } + } +} + + +/* + Do the inversion of aQ in k[x]/pol(x) where k=Z/nZ + + return -1 if something failed (for instance if gcd(aQ,aP) != 1 + return 0 if it finds a factor of n (in fact if a mpres_invert failled) + return 1 if ok + + Use f to put a factor of n if an inversion in k failled + + TODO do the general case + + +*/ +int mpalgres_invert (mpalgres_t aV, mpalgres_t aQ, + mpalgpol_t pol, mpmod_t n, mpz_t f) { + + mpres_t temp; + + + if ( (DEGREE_ALGEBRA==2) && mpres_is_zero(pol->coeff[1],n) ) { + if (mpalgres_is_zero (aQ,pol,n) ) { + return -1; + } + + + mpz_set ( pol->aTemp1[0] ,aQ[0]); 
+ mpres_neg ( pol->aTemp1[1] ,aQ[1] ,n); + + mpres_mul (pol->t1 ,aQ[0] ,aQ[0] ,n); + mpres_mul (pol->aTemp2[0] ,aQ[1] ,aQ[1] ,n); + mpres_mul (pol->aTemp2[0] ,pol->aTemp2[0] ,pol->coeff[0] ,n); + mpres_add (pol->t1 ,pol->t1 ,pol->aTemp2[0] ,n); + + + + if ( !mpres_invert(pol->t1,pol->t1,n) ) { + mpres_gcd(f,pol->t1,n); + + mpres_init (temp,n); + mpres_set_z (temp ,f ,n); + if ( mpres_is_zero (temp ,n) ) { + mpres_clear (temp ,n); + return -1; + } + else { + mpres_clear (temp ,n); + return 0; + } + } + + mpz_set ( aV[0] ,aQ[0]); + mpres_neg ( aV[1] ,aQ[1] ,n); + mpalgres_mul_mpres (aV ,aV ,pol->t1, pol ,n); + + return 1; + + } + else { // TODO do the general case + return -1; + + } +} diff -Nru gmp-ecm-7.0.4+ds/hecm/auxi.h gmp-ecm-7.0.5+ds/hecm/auxi.h --- gmp-ecm-7.0.4+ds/hecm/auxi.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/auxi.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,82 @@ +#ifndef _auxi_H +#define _auxi_H + +#include "../ecm-impl.h" + +void prodTreeCalculkPlus (mpz_t k,double B1); +void prodTreeCalculkInterPlus (mpz_t Pr,double B1, int n, int* pstop); + +void calculk (mpz_t k,double B1); +void prodTreeCalculk (mpz_t k,double B1); +void prodTreeCalculkInter (mpz_t Pr,double B1, int n, int* pstop); + + + + + +/* + We have to work in finite algebra over Z/nZ + The algebra is k[Y]/pol(Y) where degree pol=DEGREE_ALGEBRA + A special case is when Pol=Y^2+a +*/ + +#define DEGREE_ALGEBRA 2 + + + +typedef mpres_t mpalgres_t[DEGREE_ALGEBRA]; + +// polynomial for creating the algebra K[x]/P(x) +struct mpalgpol +{ + int kind_of_algebra; // if COMPOSITE_ALGEBRA then degree=4! 
+ mpres_t coeff[DEGREE_ALGEBRA]; + mpres_t t1; + mpalgres_t aTemp1; + mpalgres_t aTemp2; +}; +typedef struct mpalgpol mpalgpol_t[1]; + +void mpalgpol_init (mpalgpol_t, mpmod_t); +void mpalgpol_clear (mpalgpol_t, mpmod_t); + + +void mpalgres_init (mpalgres_t, mpmod_t); +void mpalgres_clear (mpalgres_t, mpmod_t); + +void mpalgres_gcd (mpalgres_t, mpalgres_t, mpmod_t); + + +void mpalgres_set (mpalgres_t, mpalgres_t ,mpmod_t ); +void mpalgres_set_zero (mpalgres_t ,mpmod_t ); +void mpalgres_set_ui (mpalgres_t ,unsigned long ,mpmod_t ); +void mpalgres_set_mpres (mpalgres_t ,mpres_t ,mpmod_t ); + +int mpalgres_is_zero (mpalgres_t ,mpalgpol_t ,mpmod_t); +int mpalgres_degree (mpalgres_t ,mpalgpol_t, mpmod_t); + +void mpalgres_neg (mpalgres_t, mpalgres_t, mpalgpol_t, mpmod_t); + +int mpalgres_invert (mpalgres_t, mpalgres_t, mpalgpol_t ,mpmod_t, mpz_t); + +void mpalgres_mul (mpalgres_t, mpalgres_t, mpalgres_t, mpalgpol_t, mpmod_t); +void mpalgres_add (mpalgres_t, mpalgres_t, mpalgres_t, mpalgpol_t, mpmod_t); +void mpalgres_sub (mpalgres_t, mpalgres_t, mpalgres_t, mpalgpol_t, mpmod_t); + + +void mpalgres_mul_ui (mpalgres_t, mpalgres_t, unsigned long,mpalgpol_t,mpmod_t); +void mpalgres_mul_mpres (mpalgres_t, mpalgres_t, mpres_t, mpalgpol_t, mpmod_t); + +void mpalgres_add_mpres (mpalgres_t, mpalgres_t, mpres_t, mpalgpol_t, mpmod_t); +void mpalgres_sub_mpres (mpalgres_t, mpalgres_t, mpres_t, mpalgpol_t, mpmod_t); +void mpalgres_add_ui (mpalgres_t, mpalgres_t, unsigned long,mpalgpol_t,mpmod_t); +void mpalgres_sub_ui (mpalgres_t, mpalgres_t, unsigned long,mpalgpol_t,mpmod_t); + + + + + + + + +#endif diff -Nru gmp-ecm-7.0.4+ds/hecm/generation.c gmp-ecm-7.0.5+ds/hecm/generation.c --- gmp-ecm-7.0.4+ds/hecm/generation.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/generation.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,1689 @@ +#include +#include +#include "generation.h" +#include + + +void thetaCst_init (thetaCst th, mpmod_t n ) { + mpres_init (th->be, n); + 
mpres_init (th->ga, n); + mpres_init (th->t5p, n); + mpres_init (th->t6p, n); + mpres_init (th->t7p, n); + mpres_init (th->t10p, n); + mpres_init (th->Rac, n); + mpres_init (th->p, n); +} + + +void thetaCst_clear (thetaCst th, mpmod_t n ) { + mpres_clear (th->be, n); + mpres_clear (th->ga, n); + mpres_clear (th->t5p, n); + mpres_clear (th->t6p, n); + mpres_clear (th->t7p, n); + mpres_clear (th->t10p, n); + mpres_clear (th->Rac, n); + mpres_clear (th->p, n); +} + + +void curveHyperEll_init (curveHyperEll cHEll, mpmod_t n ) { + mpres_init (cHEll->la, n); + mpres_init (cHEll->mu, n); + mpres_init (cHEll->nu, n); + mpres_init (cHEll->q, n); +} + +void curveHyperEll_clear (curveHyperEll cHEll, mpmod_t n ) { + mpres_clear (cHEll->la, n); + mpres_clear (cHEll->mu, n); + mpres_clear (cHEll->nu, n); + mpres_clear (cHEll->q, n); +} + + + +void paraGenCurve_init (paraGenCurve para, mpmod_t n ) { + mpres_init (para->s,n); + mpz_init (para->a); + mpz_init (para->b); + mpres_init (para->x,n); + mpres_init (para->y,n); +} + +void paraGenCurve_clear (paraGenCurve para, mpmod_t n ) { + mpres_clear (para->s,n); + mpz_clear (para->a); + mpz_clear (para->b); + mpres_clear (para->x,n); + mpres_clear (para->y,n); +} + + + + + + + + + + + +// ********** normal parametrization ********* + + + + +/* + Generate a correct hyperelliptic curve on Z/nZ + Begin with the given parameter and change it if needed + if the curve is uncorrect (i.e. one constant is zero) then do Nextparam(s) + the program finishes since s <= n + After having choosen s we choose nJacobi. + TODO this is not a good idea. 
we should fixed s and increase nJacobi to keep + the advantage of small s + N_JACOBI_MIN <= nJacobi < N_JACOBI_MAX_p1 random +*/ +int generateNormalCurve (mpz_t f,mpmod_t n,paraGenCurve para, + thetaCst th,curveHyperEll cHEll, + ksPoint P,ksCstPourMul cMul, + optionsHECM options) { + + int test; + mpres_t u,g; + + mpres_init (u,n); + mpres_init (g,n); + + + + do { + + test = generateOneNormalCurve (f,n,para,th,cHEll,P,cMul); + mpres_set_z (g,f,n); + + if (test == GENERATION_A_CURVE) { + // We have generate a curve + // check if it is correct + + mpres_mul (u ,th->t7p,th->t6p,n); + mpres_mul (u ,u ,P->X ,n); + mpres_gcd (f ,u ,n); + mpres_set_z (g ,f ,n); + + + if ( !mpz_cmp_ui (f,1) ) { + // general case, f=1, ie X,t6p,t7p non zero modulo all factors of n + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_CORRECT_CURVE; + } + else if ( mpres_is_zero(g,n) != 1 ) { + // f != 0,n i.e. f is a true divisor of n + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_FACTOR_FOUND; + } + else { // X=0 mod n or t6p =0 mod n or t7p = 0 mod n + nextParam (f,n,para,options); // Let's try again + } + } + else { // We have not generated a curve + if ( mpres_is_zero(g,n) == 1 ) { + // We tried to divide by 0 + nextParam (f,n,para,options); // Let's try again + } + else { + // f != 0,n i.e. 
f is a true divisor of n + // ( f!=1 because otherwise the inversion would have work) + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_FACTOR_FOUND; + } + + } + } while ( 1 ); + + +} + + + +/* + Check if a specified curve can be used +*/ +int generateNormalCurveSpecified (mpz_t f,mpmod_t n,paraGenCurve para, + thetaCst th,curveHyperEll cHEll, + ksPoint P,ksCstPourMul cMul) { + + int test; + mpres_t g; + + mpres_init (g,n); + + + test = generateOneNormalCurve (f,n,para,th,cHEll,P,cMul); + mpres_set_z (g,f,n); + + if (test == GENERATION_A_CURVE) { // We have genereted a curve + if ( mpres_is_zero(th->t7p,n) || mpres_is_zero(th->t6p,n)) { + // One of the theta constants t7p or t6p is zero modulo n + // Note that with the current choice of point on the Kummer surface we + // compute 1/t6p so currently t6p != 0 + mpres_clear (g,n); + return GENERATION_FAIL; + } + else { + // The genereted curve is correct modulo n + mpres_gcd (f, P->X, n); + if ( !mpz_cmp_ui (f,1) ) { + // general case, f=1, i.e. 
X non zero modulo all the factors of n + mpres_clear (g,n); + return GENERATION_CORRECT_CURVE; + } + else if ( mpres_is_zero(g,n) != 1 ) { + // f is a true divisor of n + mpres_clear (g,n); + return GENERATION_FACTOR_FOUND; + } + else { // X=0 mod n + mpres_clear (g,n); + return GENERATION_FAIL; + } + } + } + else { // We have not generated a curve + if ( mpres_is_zero(g,n) == 1 ) { + // We tried to divide by 0 + mpres_clear (g,n); + return GENERATION_FAIL; + } + else { + mpres_clear (g,n); + return GENERATION_FACTOR_FOUND; + } + } + +} + + + + + + + + + + + + + +/* + generate a curve with given s and nJacobi +*/ +int generateOneNormalCurve (mpz_t f,mpmod_t n,paraGenCurve para, + thetaCst th,curveHyperEll cHEll, + ksPoint P,ksCstPourMul cMul) { + + int test; + mpres_t u,x,ep,dep,v,Delta; + + mpres_init (u, n); + mpres_init (ep, n); + mpres_init (dep, n); + mpres_init (x, n); + mpres_init (v, n); + mpres_init (Delta, n); + + + + if (!mpres_invert (th->be, para->s, n)) // be=1/s + { + mpres_gcd (f, para->s, n); + + mpres_clear (u, n); + mpres_clear (ep, n); + mpres_clear (dep, n); + mpres_clear (v, n); + mpres_clear (x, n); + mpres_clear (Delta, n); + + return GENERATION_NOT_CURVE; + } + // be=1/s + + mpres_mul (ep ,th->be ,th->be ,n); // ep=be^2=1/s^2 + mpres_ui_sub (dep, 3, ep, n); + mpres_mul (dep,dep,ep,n); // dep = (3-1/s^2)/s^2 + + mpres_ui_sub (u ,1, ep, n); // u=1-1/s^2 + + test = mulJacobi2 (f,n,para->nJacobi,para->x,para->y,u,ep,dep); + // get x and y on the Jacobi curve Y^2 = ep X^4 - dep X^2 + 1 with (1,u) as + // as initial point + + if (test==MULT_JACOBI_FAIL) + { + mpres_clear (u, n); + mpres_clear (ep, n); + mpres_clear (dep, n); + mpres_clear (v, n); + mpres_clear (x, n); + mpres_clear (Delta, n); + + return GENERATION_NOT_CURVE; + } + + + mpres_mul (cHEll->q, para->s ,para->s ,n); // q=s^2 + mpres_mul (v, para->x, para->x, n); // v=x^2 + + mpres_sub(cHEll->nu, v, cHEll->q, n); // x^2-s^2 + + + mpres_sub_ui (v ,v ,1 ,n); // v=x^2-1 + if 
(!mpres_invert (v, v, n)) // v=1/(x^2-1) + { + mpres_gcd (f, v, n); + + mpres_clear (u, n); + mpres_clear (ep, n); + mpres_clear (dep, n); + mpres_clear (v, n); + mpres_clear (x, n); + mpres_clear (Delta, n); + + return GENERATION_NOT_CURVE; + } + + mpres_mul (cHEll->nu ,cHEll->nu ,v, n); // nu = (x^2-s^2)/(x^2-1) ok + + + mpres_mul (cHEll->mu ,cHEll->nu ,cHEll->nu ,n); // nu^2 + mpres_sub (Delta ,cHEll->q ,cHEll->mu ,n); // Delta=s^2-nu^2 + mpres_add (cHEll->mu ,cHEll->mu ,cHEll->q ,n); // nu^2+s^2 + mpres_sub (cHEll->mu ,cHEll->mu ,cHEll->nu ,n); // mu=-nu+nu^2+s^2 + mpres_mul (cHEll->mu ,cHEll->mu ,ep ,n); // mu = (-nu+nu^2+s^2)/s^2 + + + mpres_sub_ui(cHEll->q ,cHEll->q ,1 ,n); // q=s^2-1 + mpres_mul (v ,v ,v ,n); // v=1/(x^2-1)^2 + mpres_mul (cHEll->q ,cHEll->q ,v ,n); + mpres_mul (cHEll->q ,cHEll->q ,para->y ,n); + mpres_mul (cHEll->q ,cHEll->q ,para->x ,n); // q = x*y*(s^2-1)/(x^2-1)^2 + + // We finished be, nu, mu, q + + + + + + + + mpres_sub_ui (cHEll->la ,cHEll->nu ,1 ,n); + mpres_mul (cHEll->la ,cHEll->la ,cHEll->mu, n); // mu*(nu-1) + mpres_sub_ui (u ,cHEll->mu ,1 ,n); // u=mu-1 + + if (!mpres_invert (u, u, n)) // u=1/(mu-1) + { + mpres_gcd (f, u, n); + + mpres_clear (u, n); + mpres_clear (ep, n); + mpres_clear (dep, n); + mpres_clear (v, n); + mpres_clear (x, n); + mpres_clear (Delta, n); + + return GENERATION_NOT_CURVE; + } + mpres_mul (cHEll->la ,cHEll->la ,u ,n); // la=mu*(nu-1)/(mu-1) ok + // We finished la + + + + + + + mpres_mul (th->ga ,cHEll->la ,th->be ,n); // ga= la*be + + mpres_sub (th->t7p ,th->ga ,th->be ,n); // t7p=ga-be + mpres_mul (th->Rac ,th->t7p ,cHEll->nu ,n); + mpres_mul (th->Rac ,th->Rac ,th->be ,n); + mpres_neg (th->Rac ,th->Rac ,n); // Rac=nu*be*(be-ga) // Rac = B/t10p + + // We finished ga, t7p et Rac + + + + if (!mpres_invert (u, cHEll->mu, n)) // u=1/mu + { + mpres_gcd (f, cHEll->mu, n); + + mpres_clear (u, n); + mpres_clear (ep, n); + mpres_clear (dep, n); + mpres_clear (v, n); + mpres_clear (x, n); + mpres_clear (Delta, n); 
+ + return GENERATION_NOT_CURVE; + } + + + mpres_mul (cMul->Z0 ,cHEll->nu ,u ,n); + mpres_mul (cMul->Z0 ,cMul->Z0 ,th->be ,n); // Z0=nu/(mu*s) // Z0=1/ga + + mpres_mul (th->t10p ,cHEll->la ,u ,n); + mpres_mul (th->t10p ,th->t10p ,th->be ,n); // t10p=la/(mu*s) // t10p=1/(be*nu) + + mpres_sub (th->t5p ,th->ga ,th->t10p ,n); // t5p=ga-t10p + mpres_sub (th->t6p ,th->be ,th->t10p ,n); // t6p=be-t10p + + // We finished Z0, t10p, t5p et t6p + + + + mpres_add (cMul->x0p ,th->be ,th->ga ,n); + mpres_ui_sub (cMul->t0p ,2 ,cMul->x0p ,n); // t0p= 2-be-ga = D + mpres_add_ui (cMul->x0p ,cMul->x0p ,2 ,n); // x0p= 2+be+ga = A + + if (!mpres_invert (cMul->x0p,cMul->x0p, n)) // x0p=1/A + { + mpres_gcd (f, cMul->x0p, n); + + mpres_clear (u, n); + mpres_clear (ep, n); + mpres_clear (dep, n); + mpres_clear (v, n); + mpres_clear (x, n); + mpres_clear (Delta, n); + + return GENERATION_NOT_CURVE; + } + + mpres_mul (dep,th->be,th->ga,n); // dep=be*ga + + mpres_add_ui (u ,th->be ,1 ,n); + mpres_mul (Delta ,Delta ,u ,n); + mpres_add_ui (u ,th->ga ,1 ,n); + mpres_mul (Delta ,Delta ,u ,n); + mpres_mul (Delta ,Delta ,th->t10p ,n); + mpres_mul (Delta ,Delta ,ep, n); + // We finished Delta + + mpres_mul (cMul->invT ,Delta ,cMul->x0p ,n); // Delta/A + + mpres_sub_ui (u ,th->be ,1 ,n); + mpres_sub_ui (v ,th->ga ,1 ,n); + mpres_mul (u ,u ,v ,n); // (be-1)*(ga-1) + mpres_add_ui (v ,dep ,1 ,n); // be*ga+1 + mpres_mul (u ,u ,v ,n); // u=(be-1)*(ga-1)*(be*ga+1) + + mpres_mul_ui (v ,dep ,2 ,n); + mpres_mul (v ,v ,cMul->t0p ,n); // v= 2*be*ga*D + mpres_sub (u ,u ,v ,n); // u=(be-1)*(ga-1)*(be*ga+1) - 2*be*ga*D + + mpres_ui_sub (v ,1 ,dep ,n); // v= 1-be*ga + mpres_mul (cMul->invT ,cMul->invT ,v ,n); // Delta/A*(1-be*ga) + mpres_sub (cMul->invT ,u ,cMul->invT ,n); + // invT = (be-1)*(ga-1)*(be*ga+1) - 2*be*ga*D - Delta/A*(1-be*ga) + + mpres_mul_ui (v ,v ,2 ,n); + if (!mpres_invert (v,v, n)) // x0p=1/(1-be*ga)/2 + { + mpres_gcd (f, v, n); + + mpres_clear (u, n); + mpres_clear (ep, n); + mpres_clear (dep, 
n); + mpres_clear (v, n); + mpres_clear (x, n); + mpres_clear (Delta, n); + + return GENERATION_NOT_CURVE; + } + + mpres_mul (cMul->invT ,v ,cMul->invT, n); + // invT = ( (be-1)*(ga-1)*(be*ga+1)+2*be*ga*D-Delta/A*(1-be*ga) )/2/(1-be*ga) + // We finished invT ! + + + + + + + mpres_mul (cMul->x0p ,cMul->x0p ,th->t7p, n); + mpres_ui_sub (cMul->x0p ,0 ,cMul->x0p ,n); // x0p= B/A + + // We finished x0p + + + if (!mpres_invert (cMul->t0p,cMul->t0p, n)) // t0p=1/D + { + mpres_gcd (f, cMul->t0p, n); + + mpres_clear (u, n); + mpres_clear (ep, n); + mpres_clear (dep, n); + mpres_clear (v, n); + mpres_clear (x, n); + mpres_clear (Delta, n); + + return GENERATION_NOT_CURVE; + } + + mpres_mul (cMul->t0p ,cMul->t0p ,th->t7p, n); + mpres_ui_sub (cMul->t0p ,0 ,cMul->t0p ,n); // t0p= B/D + + // We finished t0p + + + + + + mpres_mul (P->T ,para->s ,cMul->Z0 ,n); + mpres_ui_sub (P->T, 0, P->T ,n); // T=-sZ0=-1/be/ga + mpres_mul (P->X, cMul->invT ,P->T ,n); + mpres_neg (P->Y ,P->X ,n); + + + mpres_set_ui (P->Z, 1, n); + + mpres_mul (th->p, cMul->x0p ,cMul->t0p ,n); // p=B^2/(A*D) + + + mpres_set (cMul->Y0,para->s,n); // Y0 = s + mpres_set (cMul->invZ,P->X,n); // invZ = X + + + mpres_clear (x, n); + mpres_clear (v, n); + mpres_clear (u, n); + mpres_clear (ep, n); + mpres_clear (dep, n); + mpres_clear (Delta, n); + + + return GENERATION_A_CURVE; +} + + + + + + + + + + + + + + + + + + + +// ********** small parameters ********* + + + + + + +int inverseCoorPointKS (mpz_t f,mpmod_t n, + ksPoint P,ksSmallConstPourMul cMul) { + + mpres_set_si (P->X ,cMul->invX ,n); + if (!mpres_invert (P->X ,P->X ,n)) { + mpres_gcd (f ,P->X ,n); + return INVERSE_NOT_COOR; + } + + + + mpres_set_si (P->Y ,cMul->invY ,n); + if (!mpres_invert (P->Y ,P->Y ,n)) { + mpres_gcd (f ,P->Y ,n); + return INVERSE_NOT_COOR; + } + + + mpres_set_si (P->Z ,cMul->invZ ,n); + if (!mpres_invert (P->Z ,P->Z ,n)) { + mpres_gcd (f ,P->Z ,n); + return INVERSE_NOT_COOR; + } + + + + mpres_set_si (P->T ,cMul->invT ,n); + if 
(!mpres_invert (P->T ,P->T ,n)) { + mpres_gcd (f ,P->T ,n); + return INVERSE_NOT_COOR; + } + + + return INVERSE_COOR; +} + + + + + +/* + Generate an hyperelliptic curve with small parameters + They are imposed by the choice of a,b and nJacobi + Work with mpz_t even if the goal is that the parameters fit in long + + We begin with creating X0,Y0,.. and x0p,y0p,... + Then we check if they are small (i.e. if they fit in long) + We construct the coordinate of a point on the Kummer surface + Then we check if their inverses are small (i.e. if they fit in long) + Now we have all small parameters and we construct the other +*/ +int generateOneCurveSmallParam (mpz_t f,mpmod_t n, + paraGenCurve para ,thetaCst th, + curveHyperEll cHEll,ksPoint P, + ksSmallConstPourMul cMul, + mpz_t a, mpz_t b, + mpz_t x, mpz_t y, mpz_t z) { + + int test; + + mpz_t thCste[3]; // list to check if the "theta constants" fit in long + mpz_t ThCste[3]; // list to check if the "Theta constants" fit in long + mpz_t ptCste[3]; // list to check if the "point" on KS fit in long + mpz_t g; + mpz_t t1,t2,t3,t4,t5,t6,t7,t8,t9,t0; + mpz_t sa,sb,sx,sz; + + mpz_init (thCste[0]); + mpz_init (thCste[1]); + mpz_init (thCste[2]); + mpz_init (ThCste[0]); + mpz_init (ThCste[1]); + mpz_init (ThCste[2]); + mpz_init (ptCste[2]); + mpz_init (ptCste[0]); + mpz_init (ptCste[1]); + mpz_init (g); + + mpz_init (t1); + mpz_init (t2); + mpz_init (t3); + mpz_init (t4); + mpz_init (t5); + mpz_init (t6); + mpz_init (t7); + mpz_init (t8); + mpz_init (t9); + mpz_init (t0); + + mpz_init (sa); + mpz_init (sb); + mpz_init (sx); + mpz_init (sz); + + + + mpz_mul (sa ,a ,a); // sa = a^2 + mpz_mul (sb ,b ,b); // sb = b^2 + mpz_mul (sx ,x ,x); // sx = x^2 + mpz_mul (sz ,z ,z); // sz = z^2 + + + + mpz_mul (t8 ,a ,sz); + mpz_mul (t7 ,t8 ,t8); // a^2*z^4 + mpz_mul (t9 ,b ,sx); + mpz_add (t3 ,t8 ,t9); + mpz_sub (t0 ,t8 ,t9); // t0 = a*z^2 - b*x^2 + mpz_mul (t3 ,t3 ,t3); // t3 = ( a*z^2 + b*x^2 )^2 + mpz_mul (t4 ,t0 ,t0); // t4 = ( a*z^2 - 
b*x^2 )^2 + + mpz_mul (t8 ,sa ,sz); + mpz_mul (t1 ,t8 ,t8); + mpz_mul (t9 ,sb ,sx); + mpz_sub (t5 ,t8 ,t9); // t5 = a^2*z^2 - b^2*x^2 + + mpz_mul_ui (t6 ,sz ,2); + mpz_sub (t6 ,sx ,t6); + mpz_mul (t6 ,t6 ,t9); // x^2*b^2*(x^2-2*z^2) + mpz_add (t6 ,t6 ,t7); // t6 = a^2*z^4 - 2*b^2*x^2*z^2 + b^2*x^4 + + mpz_mul (t7 ,sa ,y); + mpz_mul (t7 ,t7 ,t7); + // t7 = b^4*x^2*z^2 - 3*a^2*b^2*x^2*z^2 + a^4*z^4 + a^2*b^2*x^4 = y^2*a^4 + + + mpz_mul (t1 ,a ,b); // t1 = a*b + mpz_sub (t2 ,sx ,sz); // t2 = x^2 - z^2 + + + + mpz_mul (thCste[0] ,b ,t7); + // thCste[0] = b*(b^4*x^2*z^2 - 3*a^2*b^2*x^2*z^2 + a^4*z^4 + a^2*b^2*x^4) + mpz_mul (thCste[1] ,a ,t7); + // thCste[1] = a*(b^4*x^2*z^2 - 3*a^2*b^2*x^2*z^2 + a^4*z^4 + a^2*b^2*x^4) + mpz_mul (thCste[2] ,a ,sb); + mpz_mul (thCste[2] ,thCste[2] ,t5); + mpz_mul (thCste[2] ,thCste[2] ,t2); + mpz_neg (thCste[2] ,thCste[2]); + // thCste[2] = - a * b^2 * (x^2-z^2) * (a^2*z^2 - b^2*x^2) + + mpz_sub (t8 ,a ,b); + mpz_mul (ThCste[0] ,t8 ,t3); + mpz_add (t9 ,a ,b); + mpz_mul (ThCste[2] ,t9 ,t4); + mpz_mul (ThCste[1] ,ThCste[0] ,ThCste[2]); + // ThCste[1] = (a^2-b^2) * (a*z^2+b*x^2)^2 * (a*z^2-b*x^2)^2 + mpz_mul (ThCste[0] ,ThCste[0] ,t6); + mpz_mul (ThCste[0] ,ThCste[0] ,t8); + mpz_neg (ThCste[0] ,ThCste[0]); + // ThCste[0] = - (a-b)^2 * (a*z^2+b*x^2)^2 * (a^4*z^4-2*b^2*x^2*z^2+b^4*z^4) + mpz_mul (ThCste[2] ,ThCste[2] ,t6); + mpz_mul (ThCste[2] ,ThCste[2] ,t9); + // ThCste[2] = (a+b)^2 * (a*z^2-b*x^2)^2 * (a^4*z^4-2*b^2*x^2*z^2+b^4*z^4) + + + + + mpz_gcd (g ,thCste[0] ,thCste[1]); + mpz_gcd (g ,g ,thCste[2]); + if ( mpz_sgn(g) == 0 ) { // the constants are zero + + mpz_clear (thCste[0]); + mpz_clear (thCste[1]); + mpz_clear (thCste[2]); + mpz_clear (ThCste[0]); + mpz_clear (ThCste[1]); + mpz_clear (ThCste[2]); + mpz_clear (ptCste[0]); + mpz_clear (ptCste[1]); + mpz_clear (ptCste[2]); + mpz_clear (g); + + mpz_clear (t1); + mpz_clear (t2); + mpz_clear (t3); + mpz_clear (t4); + mpz_clear (t5); + mpz_clear (t6); + mpz_clear (t7); + mpz_clear (t8); 
+ mpz_clear (t9); + mpz_clear (t0); + + mpz_clear (sa); + mpz_clear (sb); + mpz_clear (sx); + mpz_clear (sz); + + mpz_set_ui (f ,0); + return GENERATION_NOT_CURVE; + } + mpz_divexact (thCste[0] ,thCste[0] ,g); + mpz_divexact (thCste[1] ,thCste[1] ,g); + mpz_divexact (thCste[2] ,thCste[2] ,g); + + + + mpz_gcd (g ,ThCste[0] ,ThCste[1]); + mpz_gcd (g ,g ,ThCste[2]); + if ( mpz_sgn(g) == 0 ) { // the constants are zero + + mpz_clear (thCste[0]); + mpz_clear (thCste[1]); + mpz_clear (thCste[2]); + mpz_clear (ThCste[0]); + mpz_clear (ThCste[1]); + mpz_clear (ThCste[2]); + mpz_clear (ptCste[0]); + mpz_clear (ptCste[1]); + mpz_clear (ptCste[2]); + mpz_clear (g); + + mpz_clear (t1); + mpz_clear (t2); + mpz_clear (t3); + mpz_clear (t4); + mpz_clear (t5); + mpz_clear (t6); + mpz_clear (t7); + mpz_clear (t8); + mpz_clear (t9); + mpz_clear (t0); + + mpz_clear (sa); + mpz_clear (sb); + mpz_clear (sx); + mpz_clear (sz); + + mpz_set_ui (f ,0); + return GENERATION_NOT_CURVE; + } + mpz_divexact (ThCste[0] ,ThCste[0] ,g); + mpz_divexact (ThCste[1] ,ThCste[1] ,g); + mpz_divexact (ThCste[2] ,ThCste[2] ,g); + + + + + if ( ( mpz_cmpabs_ui (thCste[0] ,LONG_MAX ) >0 ) || ( mpz_cmpabs_ui (thCste[1] ,LONG_MAX ) >0 ) || ( mpz_cmpabs_ui (thCste[2] ,LONG_MAX ) >0 ) || ( mpz_cmpabs_ui (ThCste[0] ,LONG_MAX ) >0 ) || ( mpz_cmpabs_ui (ThCste[1] ,LONG_MAX ) >0 ) || ( mpz_cmpabs_ui (ThCste[2] ,LONG_MAX ) >0 ) ) { + + mpz_clear (thCste[0]); + mpz_clear (thCste[1]); + mpz_clear (thCste[2]); + mpz_clear (ThCste[0]); + mpz_clear (ThCste[1]); + mpz_clear (ThCste[2]); + mpz_clear (ptCste[0]); + mpz_clear (ptCste[1]); + mpz_clear (ptCste[2]); + mpz_clear (g); + + mpz_clear (t1); + mpz_clear (t2); + mpz_clear (t3); + mpz_clear (t4); + mpz_clear (t5); + mpz_clear (t6); + mpz_clear (t7); + mpz_clear (t8); + mpz_clear (t9); + mpz_clear (t0); + + mpz_clear (sa); + mpz_clear (sb); + mpz_clear (sx); + mpz_clear (sz); + + return GENERATION_THESE_SMALL_PARAM_TOO_BIG; + } + + // Now we know that thCste and ThCste 
fits in long + + cMul->X0 = mpz_get_si (thCste[0]); + cMul->Y0 = mpz_get_si (thCste[1]); + cMul->Z0 = mpz_get_si (thCste[2]); + cMul->T0 = mpz_get_si (thCste[0]); + + + cMul->x0p = mpz_get_si (ThCste[0]); + cMul->y0p = mpz_get_si (ThCste[1]); + cMul->z0p = - mpz_get_si (ThCste[1]); + cMul->t0p = mpz_get_si (ThCste[2]); + + + + + + + + + // Construction of a point on the Kummer surface + + + + mpz_mul (ptCste[0] ,t0 ,sz); + mpz_mul (ptCste[0] ,ptCste[0] ,t5); + mpz_mul (ptCste[0] ,ptCste[0] ,t7); + mpz_neg (ptCste[0] ,ptCste[0]); + // ptCste[0] = - z^2 * (a*z^2 - b*x^2) * (a^2*z^2 - b^2*x^2) * (b^4*x^2*z^2 - 3*a^2*b^2*x^2*z^2 + a^4*z^4 + a^2*b^2*x^4) + + mpz_mul (t8 ,sz ,sz); + mpz_mul (t9 ,t8 ,a); // z^4*a + mpz_mul (t8 ,a ,b); + mpz_sub (t8 ,sa ,t8); + mpz_add (t8 ,t8 ,sb); // a^2 - a*b + b^2 + mpz_mul (ptCste[1] ,t8 ,t9); // z^4 * a * (a^2 - a*b + b^2) + mpz_mul_ui (ptCste[2] ,a ,3); + mpz_sub (ptCste[2] ,b ,ptCste[2]); // b-3a + mpz_mul (t8 ,ptCste[2] ,sz); // z^2 * (b-3*a) + mpz_mul (t9 ,a ,sx); + mpz_add (t8 ,t8 ,t9); // z^2 * (b-3*a) + a*x^2 + mpz_mul (t9 ,sx ,sb); + mpz_mul (ptCste[2] ,t8 ,t9); // ( z^2 * (b-3*a) + a*x^2 ) * x^2*b^2 + mpz_add (ptCste[2] ,ptCste[2] ,ptCste[1]); + // z^4 * a * (a^2 - a*b + b^2) + ( z^2 * (b-3*a) + a*x^2 ) * x^2*b^2 + // a*b^2*x^4 - 3*a*b^2*x^2*z^2 + b^3*x^2*z^2 + a^3*z^4 - a^2*b*z^4 + a*b^2*z^4 + + mpz_mul (ptCste[1] ,ptCste[2] ,sx); + mpz_mul (ptCste[1] ,ptCste[1] ,sb); + mpz_mul (ptCste[1] ,ptCste[1] ,t2); + mpz_mul (ptCste[1] ,ptCste[1] ,t5); + mpz_neg (ptCste[1] ,ptCste[1]); + // ptCste[1] = - x^2 * (x^2-z^2) * b^2 * (a^2*z^2 - b^2*x^2) * (a*b^2*x^4-3*a*b^2*x^2*z^2+b^3*x^2*z^2+a^3*z^4-a^2*b*z^4+a*b^2*z^4) + + mpz_mul (ptCste[2] ,ptCste[2] ,sx); + mpz_mul (ptCste[2] ,ptCste[2] ,t7); + mpz_neg (ptCste[2] ,ptCste[2]); + // ptCste[2] = - x^2 * (b^4*x^2*z^2 - 3*a^2*b^2*x^2*z^2 + a^4*z^4 + a^2*b^2*x^4) * (a*b^2*x^4-3*a*b^2*x^2*z^2+b^3*x^2*z^2+a^3*z^4-a^2*b*z^4+a*b^2*z^4) + + + + mpz_gcd (g ,ptCste[0] ,ptCste[1]); + mpz_gcd (g ,g 
,ptCste[2]); + if ( mpz_sgn(g) != 0 ) { // g != 0. + mpz_divexact (ptCste[0] ,ptCste[0] ,g); + mpz_divexact (ptCste[1] ,ptCste[1] ,g); + mpz_divexact (ptCste[2] ,ptCste[2] ,g); + } + + if ( ( mpz_sgn(g) != 0 ) && ( mpz_cmpabs_ui (ptCste[0] ,LONG_MAX ) <= 0 ) && ( mpz_cmpabs_ui (ptCste[1] ,LONG_MAX ) <= 0 ) && ( mpz_cmpabs_ui (ptCste[2] ,LONG_MAX ) <= 0 ) ) { + // We have genereted a correct point on the Kummer surface + + + cMul->invX = mpz_get_si (ptCste[0]); + cMul->invY = - mpz_get_si (ptCste[0]); + cMul->invZ = mpz_get_si (ptCste[1]); + cMul->invT = mpz_get_si (ptCste[2]); + + + test = inverseCoorPointKS (f ,n ,P ,cMul); + + } + else { // g=0 or the constants are too large + + + mpz_mul (ptCste[0] ,t0 ,a); + mpz_mul (ptCste[0] ,ptCste[0] ,t5); + mpz_mul (ptCste[0] ,ptCste[0] ,t7); + // ptCste[0] = a * (a*z^2 - b*x^2) * (a^2*z^2 - b^2*x^2) * (b^4*x^2*z^2 - 3*a^2*b^2*x^2*z^2 + a^4*z^4 + a^2*b^2*x^4) + + mpz_mul (t8 ,sx ,sz); + mpz_mul (t9 ,t8 ,b); // x^2*z^2*b + mpz_mul_ui (ptCste[1] ,sa ,2); + mpz_mul (t8 ,a ,b); + mpz_add (t8 ,t8 ,ptCste[1]); + mpz_sub (t8 ,t8 ,sb); // 2*a^2 + a*b - b^2 + mpz_mul (ptCste[1] ,t9 ,t8); // x^2*z^2*b * (2*a^2 + a*b - b^2) + mpz_mul (t8 ,sz ,sz); + mpz_mul (t9 ,t8 ,a); + mpz_mul (ptCste[2] ,sx ,sx); + mpz_mul (t8 ,ptCste[2] ,b); + mpz_add (t8 ,t8 ,t9); // a*z^4 + b*x^4 + mpz_mul (ptCste[2] ,t8 ,sa); // a^2 * (a*z^4 + b*x^4) + mpz_sub (ptCste[1] ,ptCste[1] ,ptCste[2]); + // x^2*z^2*b * (2*a^2 + a*b - b^2) - a^2 * (a*z^4 + b*x^4) + // - ( a^2*b*x^4 - 2*a^2*b*x^2*z^2 - a*b^2*x^2*z^2 + b^3*x^2*z^2 + a^3*z^4 ) + mpz_mul (ptCste[2] ,ptCste[1] ,b); + //-b*( a^2*b*x^4 - 2*a^2*b*x^2*z^2 - a*b^2*x^2*z^2 + b^3*x^2*z^2 + a^3*z^4 ) + + mpz_mul (ptCste[1] ,ptCste[2] ,sa); + mpz_mul (ptCste[1] ,ptCste[1] ,t2); + mpz_mul (ptCste[1] ,ptCste[1] ,t5); + // ptCste[1] = - a^2 * b * (x^2-z^2) * (a^2*z^2 - b^2*x^2) * (a^2*b*x^4 - 2*a^2*b*x^2*z^2 - a*b^2*x^2*z^2 + b^3*x^2*z^2 + a^3*z^4) + + mpz_mul (ptCste[2] ,ptCste[2] ,t7); + // ptCste[2] = - b * 
(a^2*z^2 - b^2*x^2) * (a^2*b*x^4 - 2*a^2*b*x^2*z^2 - a*b^2*x^2*z^2 + b^3*x^2*z^2 + a^3*z^4) * (b^4*x^2*z^2 - 3*a^2*b^2*x^2*z^2 + a^4*z^4 + a^2*b^2*x^4) + + + mpz_gcd (g ,ptCste[0] ,ptCste[1]); + mpz_gcd (g ,g ,ptCste[2]); + if ( mpz_sgn(g) == 0 ) { // g = 0. + + mpz_clear (thCste[0]); + mpz_clear (thCste[1]); + mpz_clear (thCste[2]); + mpz_clear (ThCste[0]); + mpz_clear (ThCste[1]); + mpz_clear (ThCste[2]); + mpz_clear (ptCste[0]); + mpz_clear (ptCste[1]); + mpz_clear (ptCste[2]); + mpz_clear (g); + + mpz_clear (t1); + mpz_clear (t2); + mpz_clear (t3); + mpz_clear (t4); + mpz_clear (t5); + mpz_clear (t6); + mpz_clear (t7); + mpz_clear (t8); + mpz_clear (t9); + mpz_clear (t0); + + mpz_clear (sa); + mpz_clear (sb); + mpz_clear (sx); + mpz_clear (sz); + + mpz_set_ui (f ,0); + return GENERATION_NOT_CURVE; + /* + TODO + We could try with other points on the Kummer surface + */ + } + mpz_divexact (ptCste[0] ,ptCste[0] ,g); + mpz_divexact (ptCste[1] ,ptCste[1] ,g); + mpz_divexact (ptCste[2] ,ptCste[2] ,g); + + if ( ( mpz_cmpabs_ui (ptCste[0] ,LONG_MAX ) <= 0 ) && ( mpz_cmpabs_ui (ptCste[1] ,LONG_MAX ) <= 0 ) && ( mpz_cmpabs_ui (ptCste[2] ,LONG_MAX ) <= 0 ) ) { + // We have genereted a good point on the Kummer surface + + + cMul->invX = mpz_get_si (ptCste[0]); + cMul->invY = - mpz_get_si (ptCste[0]); + cMul->invZ = mpz_get_si (ptCste[1]); + cMul->invT = mpz_get_si (ptCste[2]); + + + test = inverseCoorPointKS (f ,n ,P ,cMul); + + } + else { // the constants are too big + + mpz_clear (thCste[0]); + mpz_clear (thCste[1]); + mpz_clear (thCste[2]); + mpz_clear (ThCste[0]); + mpz_clear (ThCste[1]); + mpz_clear (ThCste[2]); + mpz_clear (ptCste[0]); + mpz_clear (ptCste[1]); + mpz_clear (ptCste[2]); + mpz_clear (g); + + mpz_clear (t1); + mpz_clear (t2); + mpz_clear (t3); + mpz_clear (t4); + mpz_clear (t5); + mpz_clear (t6); + mpz_clear (t7); + mpz_clear (t8); + mpz_clear (t9); + mpz_clear (t0); + + mpz_clear (sa); + mpz_clear (sb); + mpz_clear (sx); + mpz_clear (sz); + + return 
GENERATION_THESE_SMALL_PARAM_TOO_BIG; + /* + TODO + We could try with other points on the Kummer surface + */ + } + } + + + if (test == 0) { + // We had a problem during the inversion of the coordinates of the initial + // point + + mpz_clear (thCste[0]); + mpz_clear (thCste[1]); + mpz_clear (thCste[2]); + mpz_clear (ThCste[0]); + mpz_clear (ThCste[1]); + mpz_clear (ThCste[2]); + mpz_clear (ptCste[0]); + mpz_clear (ptCste[1]); + mpz_clear (ptCste[2]); + mpz_clear (g); + + mpz_clear (t1); + mpz_clear (t2); + mpz_clear (t3); + mpz_clear (t4); + mpz_clear (t5); + mpz_clear (t6); + mpz_clear (t7); + mpz_clear (t8); + mpz_clear (t9); + mpz_clear (t0); + + mpz_clear (sa); + mpz_clear (sb); + mpz_clear (sx); + mpz_clear (sz); + + return GENERATION_NOT_CURVE; + } + + + + + mpres_t T1,T2,T3,T4,T5,T6,T7; + mpres_t amod,bmod; + mpres_t u,v; + + mpres_init (T1 ,n); + mpres_init (T2 ,n); + mpres_init (T3 ,n); + mpres_init (T4 ,n); + mpres_init (T5 ,n); + mpres_init (T6 ,n); + mpres_init (T7 ,n); + + mpres_init (amod ,n); + mpres_init (bmod ,n); + mpres_init (u ,n); + mpres_init (v ,n); + + mpres_set_z (T1 ,t1 ,n); // T1 = a*b + mpres_set_z (T2 ,t2 ,n); // T2 = x^2 - z^2 + mpres_set_z (T3 ,t3 ,n); // T3 = ( a*z^2 + b*x^2 )^2 + mpres_set_z (T4 ,t4 ,n); // T4 = ( a*z^2 - b*x^2 )^2 + mpres_set_z (T5 ,t5 ,n); // T5 = a^2*z^2 - b^2*x^2 + mpres_set_z (T6 ,t6 ,n); // T6 = a^2*z^4 - 2*b^2*x^2*z^2 + b^2*x^4 + mpres_set_z (T7 ,t7 ,n); + // T7 = b^4*x^2*z^2 - 3*a^2*b^2*x^2*z^2 + a^4*z^4 + a^2*b^2*x^4 + + + mpz_clear (thCste[0]); + mpz_clear (thCste[1]); + mpz_clear (thCste[2]); + mpz_clear (ThCste[0]); + mpz_clear (ThCste[1]); + mpz_clear (ThCste[2]); + mpz_clear (ptCste[0]); + mpz_clear (ptCste[1]); + mpz_clear (ptCste[2]); + mpz_clear (g); + + mpz_clear (t1); + mpz_clear (t2); + mpz_clear (t3); + mpz_clear (t4); + mpz_clear (t5); + mpz_clear (t6); + mpz_clear (t7); + mpz_clear (t8); + mpz_clear (t9); + mpz_clear (t0); + + mpz_clear (sa); + mpz_clear (sb); + mpz_clear (sx); + mpz_clear 
(sz); + + + mpres_set_z (para->y ,y ,n); + mpres_set_z (para->x ,x ,n); + mpres_set_z (u ,z ,n); + if (!mpres_invert (u, u, n)) // u = 1/z + { + mpres_gcd (f, u, n); + + mpres_clear (T1 ,n); + mpres_clear (T2 ,n); + mpres_clear (T3 ,n); + mpres_clear (T4 ,n); + mpres_clear (T5 ,n); + mpres_clear (T6 ,n); + mpres_clear (T7 ,n); + + mpres_clear (amod ,n); + mpres_clear (bmod ,n); + mpres_clear (u ,n); + mpres_clear (v ,n); + + return GENERATION_NOT_CURVE; + } + mpres_mul (para->x ,para->x ,u ,n); // x=x/z + mpres_mul (u ,u ,u ,n); // 1/z^2 + mpres_mul (para->y ,para->y ,u ,n); + + mpres_set_z (amod ,a ,n); + mpres_set_z (bmod ,b ,n); + + if (!mpres_invert (u, T1, n)) // u = 1/(a*b) + { + mpres_gcd (f, T1, n); + + mpres_clear (T1 ,n); + mpres_clear (T2 ,n); + mpres_clear (T3 ,n); + mpres_clear (T4 ,n); + mpres_clear (T5 ,n); + mpres_clear (T6 ,n); + mpres_clear (T7 ,n); + + mpres_clear (amod ,n); + mpres_clear (bmod ,n); + mpres_clear (u ,n); + mpres_clear (v ,n); + + return GENERATION_NOT_CURVE; + } + mpres_mul (para->s ,amod ,u ,n); + mpres_mul (para->s ,para->s ,amod ,n); // s=a/b + mpres_mul (th->be ,bmod ,u ,n); + mpres_mul (th->be ,th->be ,bmod ,n); // be = 1/s = b/a + // On a fini s et be + + mpres_mul (th->t10p ,T1 ,T2 ,n); // a*b*(x^2-z^2) + mpres_mul (th->ga ,u ,T7 ,n); // t7/(a*b) + + mpres_mul (u ,u ,u ,n); // 1/(a*b)^2 + + if (!mpres_invert (cHEll->q ,T2 ,n)) // 1/(x^2-z^2) + { + mpres_gcd (f, T2, n); + + mpres_clear (T1 ,n); + mpres_clear (T2 ,n); + mpres_clear (T3 ,n); + mpres_clear (T4 ,n); + mpres_clear (T5 ,n); + mpres_clear (T6 ,n); + mpres_clear (T7 ,n); + + mpres_clear (amod ,n); + mpres_clear (bmod ,n); + mpres_clear (u ,n); + mpres_clear (v ,n); + + return GENERATION_NOT_CURVE; + } + + mpres_mul (th->ga ,th->ga ,cHEll->q ,n); // t7/(a*b*(x^2-z^2)) + mpres_mul (u ,u ,cHEll->q ,n); // u = 1/( (a*b)^2 * (x^2-z^2) ); + mpres_mul (cHEll->mu ,T7 ,u ,n); // t7/( (a*b)^2 * (x^2-z^2) ); + + mpres_mul (v ,amod ,amod ,n); // v=a^2 + mpres_mul (cHEll->la 
,cHEll->mu ,v ,n); // t7/(b^2*(x^2-z^2)); + mpres_mul (cHEll->nu ,v ,u ,n); + mpres_mul (cHEll->nu ,cHEll->nu ,T5 ,n); + mpres_neg (cHEll->nu ,cHEll->nu ,n); + // nu = -(a^2*z^2-b^2*x^2)/(b^2*(x^2-z^2)) + // We finished nu + + mpres_mul (cHEll->mu ,cHEll->mu ,cHEll->q ,n); + // mu = t7 / ( (a*b)^2 * (x^2-z^2)^2 ); + // We finished mu + + + if (!mpres_invert (v ,T5 ,n)) // 1/(a^2*z^2-b^2*x^2) + { + mpres_gcd (f, T5, n); + + mpres_clear (T1 ,n); + mpres_clear (T2 ,n); + mpres_clear (T3 ,n); + mpres_clear (T4 ,n); + mpres_clear (T5 ,n); + mpres_clear (T6 ,n); + mpres_clear (T7 ,n); + + mpres_clear (amod ,n); + mpres_clear (bmod ,n); + mpres_clear (u ,n); + mpres_clear (v ,n); + + return GENERATION_NOT_CURVE; + } + mpres_neg (v ,v ,n); // v = - 1/(a^2*z^2-b^2*x^2) + + mpres_mul (th->ga ,th->ga ,v ,n); // ga=-t7/(a*b*(x^2-z^2)*(a^2*z^2-b^2*x^2)) + mpres_mul (cHEll->la ,cHEll->la ,v ,n); + // la = - t7/(b^2*(x^2-z^2)*(a^2*z^2-b^2*x^2)); + mpres_mul (th->t10p ,th->t10p ,v ,n); + // t10p = -a*b*(x^2-z^2)/(a^2*z^2-b^2*x^2) + // We finished ga, la, t10p + + mpres_set_z (u ,z ,n); + mpres_mul (cHEll->q ,cHEll->q ,u ,n); + mpres_mul (cHEll->q ,cHEll->q ,cHEll->q ,n); + mpres_mul (cHEll->q ,cHEll->q ,u ,n); // z^3 / (x^2-z^2)^2 + mpres_set_z (u ,x ,n); + mpres_mul (cHEll->q ,cHEll->q ,u ,n); + mpres_mul (cHEll->q ,cHEll->q ,para->y ,n); // x*y*z^3 / (x^2-z^2)^2 + mpres_mul (u ,para->s ,para->s ,n); + mpres_sub_ui (u ,u ,1 ,n); // s^2-1 + mpres_mul (cHEll->q ,cHEll->q ,u ,n); + // q = x * y * (a^2-b^2) * z^3 / ( b^2 * (x^2-z^2)^2 ) + // We finished q + + + mpres_mul (u ,T3 ,T4 ,n); + if (!mpres_invert (u ,u ,n)) //u = 1/( (a*z^2 + b*x^2)^2 * (a*z^2 - b*x^2)^2 ) + { + mpres_gcd (f, u, n); + + mpres_clear (T1 ,n); + mpres_clear (T2 ,n); + mpres_clear (T3 ,n); + mpres_clear (T4 ,n); + mpres_clear (T5 ,n); + mpres_clear (T6 ,n); + mpres_clear (T7 ,n); + + mpres_clear (amod ,n); + mpres_clear (bmod ,n); + mpres_clear (u ,n); + mpres_clear (v ,n); + + return GENERATION_NOT_CURVE; + } 
+ mpres_mul (th->p ,T6 ,T6 ,n); + mpres_mul (th->p ,th->p ,u ,n); + mpres_neg (th->p ,th->p ,n); + // p = - (a^2*z^4 - 2*b^2*x^2*z^2 + b^2*x^4) / ( (a*z^2 + b*x^2)^2 * (a*z^2 - b*x^2)^2 ) + // We finished p + + + + + mpres_sub (th->t7p ,th->ga ,th->be ,n); + // We finished t7p + + mpres_mul (th->Rac ,th->be ,cHEll->nu ,n); + mpres_mul (th->Rac ,th->Rac ,th->t7p ,n); + mpres_neg (th->Rac ,th->Rac ,n); + // We finished Rac + + mpres_sub (th->t5p ,th->ga ,th->t10p ,n); + mpres_sub (th->t6p ,th->be ,th->t10p ,n); + // We finished t5p, t6p + + + + + mpres_clear (T1 ,n); + mpres_clear (T2 ,n); + mpres_clear (T3 ,n); + mpres_clear (T4 ,n); + mpres_clear (T5 ,n); + mpres_clear (T6 ,n); + mpres_clear (T7 ,n); + + mpres_clear (amod ,n); + mpres_clear (bmod ,n); + mpres_clear (u ,n); + mpres_clear (v ,n); + + + + return GENERATION_A_CURVE; +} + + + + + + + + + + + + +/* + generate curve with small parameters + We do the special case nJacobi=2 "by hand". It produces enought curves for + finding factors of at least 60 digits + In this case we only need to generate s=a/b since x,y,z are fixed. + We generate the curve one by one such that a+b=c (for constants c) then we go + to the next c + We need a>0, b>0. 
Moreover s=2,4 doesn't work +*/ +int generateCurveSmallParam (mpz_t f,mpmod_t n, + paraGenCurve para ,thetaCst th, + curveHyperEll cHEll,ksPoint P, + ksSmallConstPourMul cMul, + mpz_t a, mpz_t b, + int nJacobi, + optionsHECM options) { + + int test; + mpz_t x,y,z; + mpres_t u; + mpres_t g; + + mpz_init (x); + mpz_init (y); + mpz_init (z); + mpres_init (u ,n); + mpres_init (g ,n); + + + + + do { + + mulJacobiEntiers (a,b,nJacobi,x,y,z); + + + test = generateOneCurveSmallParam (f,n,para,th,cHEll,P,cMul,a,b,x,y,z); + + if (test == GENERATION_NOT_CURVE) { + mpres_set (g ,f ,n); + if ( mpres_is_zero(g,n) == 1 ) {// f =0 mod n + // Let's try again + test = nextParam (f,n,para,options); + if ( test == NEXT_SMALL_PARAM_TOO_BIG ) { // TODO go to nJacobi=3 + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_PARAM_TOO_BIG; // TODO idem + } + } + else { + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u ,n); + mpres_clear (g ,n); + return GENERATION_FACTOR_FOUND; + } + } + else if (test == GENERATION_THESE_SMALL_PARAM_TOO_BIG) { + // Let's try again + test = nextParam (f,n,para,options); + if ( test == NEXT_SMALL_PARAM_TOO_BIG ) { // TODO go to nJacobi=3 + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_PARAM_TOO_BIG; // TODO cf au idem + } + } + else { // We have genereted a curve + + mpres_sub_ui (u ,cHEll->mu ,1 ,n); + mpres_mul (u ,u ,cHEll->mu ,n); + mpres_mul (u ,u ,th->t7p ,n); + mpres_mul (u ,u ,th->t6p ,n); + + mpres_gcd (f ,u ,n); + mpres_set_z (g ,f ,n); + + if ( !mpz_cmp_ui (f,1) ) { + // general case, f=1, ie non zero modulo all factors of n + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_CORRECT_CURVE; + } + else if ( mpres_is_zero(g,n) != 1 ) { + // f is a real divisor of n + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear 
(g,n); + return GENERATION_FACTOR_FOUND; + } + else { + // Let's try again + test = nextParam (f,n,para,options); + if ( test == NEXT_SMALL_PARAM_TOO_BIG ) { // TODO go to nJacobi=3 + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_PARAM_TOO_BIG; // TODO idem + } + } + } + } while ( 1 ); + +} + + + + + + + +int generateCurveSmallParamSpecified (mpz_t f,mpmod_t n, + paraGenCurve para,thetaCst th, + curveHyperEll cHEll,ksPoint P, + ksSmallConstPourMul cMul) { + int test; + mpz_t x,y,z; + mpres_t u; + mpres_t g; + + mpz_init (x); + mpz_init (y); + mpz_init (z); + + mpres_init (u,n); + mpres_init (g,n); + + + + mulJacobiEntiers (para->a,para->b,para->nJacobi,x,y,z); + + + + + mpres_set_z (para->s ,para->b ,n); + if (!mpres_invert (para->s ,para->s ,n)) // s=1/b + { + mpres_gcd (f, para->s, n); + mpres_set_z (g ,f ,n); + if ( mpres_is_zero (g,n)==1 ) { // f=0 mod n + mpres_set_ui(para->s ,0 ,n); + + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_FAIL; + } + else { + mpres_set_ui(para->s ,0 ,n); + + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_FACTOR_FOUND; + } + } + mpres_set_z (u ,para->a ,n); + mpres_mul (para->s ,para->s ,u ,n); + + test = generateOneCurveSmallParam (f ,n ,para ,th ,cHEll ,P ,cMul ,para->a ,para->b ,x ,y ,z); + + if (test == GENERATION_NOT_CURVE) { + mpres_set (g ,f ,n); + if ( mpres_is_zero(g,n) == 1 ) {// f =0 mod n + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_FAIL; + } + else { + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_FACTOR_FOUND; + } + } + else if (test == GENERATION_THESE_SMALL_PARAM_TOO_BIG) { + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_FAIL; + } + 
else { // We have genereted a curve + + mpres_sub_ui (u ,cHEll->mu ,1 ,n); + mpres_mul (u ,u ,cHEll->mu ,n); + mpres_mul (u ,u ,th->t7p ,n); + mpres_mul (u ,u ,th->t6p ,n); + + mpres_gcd (f ,u ,n); + mpres_set_z (g ,f ,n); + + if ( !mpz_cmp_ui (f,1) ) { + // general case, f=1, ie non zero modulo all factors of n + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_CORRECT_CURVE; + } + else if ( mpres_is_zero(g,n) != 1 ) { + // f is a real divisor of n + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_FACTOR_FOUND; + } + else { + mpz_clear (x); + mpz_clear (y); + mpz_clear (z); + mpres_clear (u,n); + mpres_clear (g,n); + return GENERATION_FAIL; + } + } +} + + + + + + +int nextSmallParam (mpz_t a, mpz_t b, + const long Ha,const long Hb) { + + mpz_t g; + mpz_init (g); + + do { + if ( mpz_cmp (a,b) < 0 ){ + if ( mpz_cmp_ui (a,Ha) < 0 ) { + mpz_add_ui (a ,a ,1); // a++ + } + else if ( mpz_cmp_ui (b,Hb) < 0 ) { + mpz_add_ui (b ,b ,1); // b++ + mpz_set_ui (a,1); // a=1 + } + else { + mpz_clear (g); + return NEXT_SMALL_PARAM_TOO_BIG; + } + } + else if ( mpz_cmp_ui (b,1) > 0) { + mpz_sub_ui (b ,b ,1); // b-- + } + else { + if ( mpz_cmp_ui (a,Hb) < 0) { + mpz_add_ui (b ,a ,1); // b=a+1 + mpz_set_ui (a ,1); // a=1 + } + else if ( mpz_cmp_ui (a,Ha) < 0 ) { + mpz_add_ui (a ,a ,1); // a++ + mpz_set_ui (b ,Hb); // b=Hb + } + else { + mpz_clear (g); + return NEXT_SMALL_PARAM_TOO_BIG; + } + } + mpz_gcd (g ,a ,b); + } while ( mpz_cmp_ui (g,1) != 0 ); // while g != 1 + + mpz_clear (g); + return NEXT_PARAM_CAN_BE_USED; + +} + + + +int nextParam (mpz_t f,mpmod_t n, + paraGenCurve para, + optionsHECM options) { + int test; + + if ( options->smallParam == TRUE ) { + + long Ha,Hb; + mpz_t t1; + mpz_init (t1); + + if (para->nJacobi==2) { + mpz_set_ui (t1 ,LONG_MAX); + mpz_mul_ui (t1 ,t1 ,4); + mpz_root (t1 ,t1 ,5); + Ha = mpz_get_ui (t1); // Ha = 1600 + Hb = Ha; // Hb 
= 1000 + } + else if (para->nJacobi==3) { + mpz_set_ui (t1 ,LONG_MAX); + mpz_mul_ui (t1 ,t1 ,3194512); // *2^15*3^8/673 + mpz_root (t1 ,t1 ,8); + Hb = mpz_get_ui (t1); // Hb = 200 + Ha = Hb*3; // Hb*673,2^(1/8) // Ha = 100 + } + else if (para->nJacobi==4) { + mpz_set_ui (t1 ,LONG_MAX); + mpz_mul_ui (t1 ,t1 ,65536); + mpz_mul_ui (t1 ,t1 ,112101); // *2^16*3^15/128 + mpz_root (t1 ,t1 ,12); + Hb = mpz_get_ui (t1); // Hb = 30 + Ha = Hb*2; // Hb*128^(1/8) // Ha = 20 + } + else { + // TODO by default we take the same bound than for nJacobi = 4 + // What is the real bound? + mpz_set_ui (t1 ,LONG_MAX); + mpz_mul_ui (t1 ,t1 ,65536); + mpz_mul_ui (t1 ,t1 ,112101); + mpz_root (t1 ,t1 ,12); + Hb = mpz_get_ui (t1); + Ha = Hb*2; + } + + + test = nextSmallParam (para->a,para->b,Ha,Hb); + + if (test==NEXT_PARAM_CAN_BE_USED) { + mpz_clear (t1); + return NEXT_PARAM_CAN_BE_USED; + } + else { // test==NEXT_SMALL_PARAM_TOO_BIG + if ( para->nJacobi < N_JACOBI_SMALL_PARAM_MAX ) { + para->nJacobi++; + mpz_set_ui (para->a ,1); + mpz_set_ui (para->b ,2); + mpz_clear (t1); + return NEXT_PARAM_CAN_BE_USED; + } + else { + mpz_clear (t1); + return NEXT_SMALL_PARAM_TOO_BIG; + } + } + } + else { // (options->smallParam == FALSE ) + para->nJacobi = (rand() % (N_JACOBI_MAX_p1 - N_JACOBI_MIN)) + N_JACOBI_MIN; + mpres_add_ui (para->s ,para->s ,1 ,n); + return NEXT_PARAM_CAN_BE_USED; + } + +} diff -Nru gmp-ecm-7.0.4+ds/hecm/generation.h gmp-ecm-7.0.5+ds/hecm/generation.h --- gmp-ecm-7.0.4+ds/hecm/generation.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/generation.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,115 @@ +#ifndef _GENERATION__H +#define _GENERATION__H + +#include "../ecm-impl.h" + +#include "Jacobi.h" +#include "auxi.h" + + +#define GENERATION_THESE_SMALL_PARAM_TOO_BIG -1 + // the parameters that were used, give constants that are too big +#define GENERATION_NOT_CURVE 0 + // There is no curve with these (fixed) parameters +#define GENERATION_A_CURVE 1 + // A curve is 
generated with these fixed parameters +#define GENERATION_CORRECT_CURVE 2 + // The generated curve can be used +#define GENERATION_FACTOR_FOUND 3 + // We found a factor +#define GENERATION_FAIL 4 + // The generation have failed (maybe the generated curve can't be used) +#define GENERATION_PARAM_TOO_BIG 5 + // There is no more curve with small enough parameters + + +#define POW_MAX 4 // nJacobi < 2^5 +#define N_JACOBI_MIN 0 +#define N_JACOBI_MAX_p1 (1< +#include +#include + +#include "../ecm-impl.h" + +#include "hecm.h" +#include "auxi.h" +#include "generation.h" +#include "morphismes.h" + + + + + + +void optionsHECM_init (optionsHECM options) { + mpz_init (options->heightMin); + mpz_init (options->heightMax); + + options->smallParam = TRUE; + options->curveSpecified = FALSE; + options->initialCurveSpecified = FALSE; + options->verbose = OUTPUT_NORMAL; + + options->nbtests = 1; + mpz_set_ui (options->heightMin,0); + mpz_set_ui (options->heightMax,0); +} + + +void optionsHECM_clear (optionsHECM options) { + mpz_clear (options->heightMin); + mpz_clear (options->heightMax); +} + + +void printCurve (paraGenCurve para,optionsHECM options,mpmod_t n) { + mpz_t s; + mpz_init (s); + + mpres_get_z (s ,para->s ,n); + + if (options->smallParam == TRUE) { + gmp_printf ("s=%Zd/%Zd",para->a,para->b); + } + else { + gmp_printf("s=%Zd",s ); + } + + printf(" and nJacobi=%d\n",para->nJacobi); + + mpz_clear (s); +} + + + +static void +usage (void) +{ + printf ("Perform ECM with hyperelliptic curves of genus 2. The hyperelliptic curves are\n"); + printf ("(2,2)-decomposable which means that their jacobian is isogenous to the product\n"); + printf ("of two elliptic curves. 
/* Print the command-line help on stdout. */
static void
usage (void)
{
  static const char *const helpText[] = {
    "Perform ECM with hyperelliptic curves of genus 2. The hyperelliptic curves are\n",
    "(2,2)-decomposable which means that their jacobian is isogenous to the product\n",
    "of two elliptic curves. Therefore one run of HECM with this kind of curves is\n",
    "equivalent to two runs of ECM on the underlying curves.\n",
    "Use Kummer surfaces to speed up the arithmetics in the Jacobians.\n",
    "The use of small parameters makes it quicker than GMP-ECM for numbers of\n",
    "at least 300 digits.\n",

    "\nParameters:\n",
    "-n number to be factored in base 10\n",
    "-B1 B1 in base 10\n",
    "-B2 B2 in base 10\n",
    "-c Numbers of tests to do. If unspecified run only once.\n",
    " If 0 run until it finds a factor.\n",
    "-v verbose mode\n",

    "-not_low_param Work with normal parameters (not small)\n",

    "-spe We try to generate the curve with this fixed parameters.\n",
    " One of the following options should have been specified:\n",
    " -ab if we want small parameters\n",
    " -s if we want normal parametrization\n",
    " If the generated curve is correct we run HECM with it else we print\n",
    " an error\n",

    "-ab a b fix the parameter s=a/b. We try to use small parameters if possible\n",
    "-h_min minimal height of s. If equal to 0 or unspecified, then we begin\n",
    " by 1\n",
    "-h_max maximal height of s. If equal to 0 or unspecified, then no limit\n",

    "-nJacobi For the parametrization, we need to have a point on a Jacobi\n",
    " curve. We begin by P=(1,*) and we use the point nJacobi*P=(x,y)\n",
    " If nJacobi<0, we use |nJacobi| (absolute value)\n",
    " If nJacobi=1, we use nJacobi=2 (first usable point)\n",
    " If nJacobi=0 or unspecified, and in the case of small parameters,\n",
    " we use nJacobi=2 and increase first a,b. If we can't increase a,b\n",
    " we increase nJacobi by 1...\n",
    " If nJacobi=0 or unspecified, and in the case of normal\n",
    " parametrization, we take random nJacobi for each run.\n",

    "-s s fix the parameter s for normal parametrization\n",
    "-seed fix the seed. Only useful with normal parametrization\n",

    "-h, --help Prints this help and exit.\n"
  };
  size_t i;

  /* None of the lines contains a conversion specification, so writing them
     verbatim produces the same output as printf would. */
  for (i = 0; i < sizeof helpText / sizeof helpText[0]; i++) {
    fputs (helpText[i] ,stdout);
  }
}
If we can't increase a,b\n"); + printf (" we increase nJacobi by 1...\n"); + printf (" If nJacobi=0 or unspecified, and in the case of normal\n"); + printf (" parametrization, we take random nJacobi for each run.\n"); + + printf ("-s s fix the parameter s for normal parametrization\n"); + printf ("-seed fix the seed. Only useful with normal parametrization\n"); + + + + printf ("-h, --help Prints this help and exit.\n"); +} + + + + +double B1 = 25.0; +char * number = "4816415081"; // 58027 83003 +char * Nsigma = "2"; // s=2 +int seed = 0; +char * charB2 = "-1"; + +int main(int argc, char * argv[]) { + + + clock_t + temps_initial, // initial time in micro-seconds + temps_final; // final time in micro-seconds + double + temps_cpu; // total time in seconds + + temps_cpu=0; + temps_initial = clock (); + + + optionsHECM options; + optionsHECM_init (options); + + mpmod_t n; + mpz_t k; + mpz_t f; // factor of N + mpz_t s; // The parameter s in mpz_t + paraGenCurve para; + curve T1,T2; + int test,test2; + unsigned int nb; + double B2scale = 1.0; + mpz_t B2; + + char *a, *b, *heightMax, *heightMin; + + para->nJacobi = 2; + a="1";b="2"; + mpz_init_set_si (B2,ECM_DEFAULT_B2); // set B2 + + + while ((argc > 1) && (argv[1][0] == '-')) { + if ((argc > 2) && (strcmp (argv[1], "-n")) == 0) { + number = argv[2]; + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-s")) == 0) { + Nsigma = argv[2]; + options->initialCurveSpecified = TRUE; + options->smallParam = FALSE; + argv += 2; + argc -= 2; + } + else if (strcmp (argv[1], "-h") == 0 || strcmp (argv[1], "--help") == 0) { + usage (); + mpz_clear(B2); + exit (0); + } + else if ((argc > 3) && (strcmp (argv[1], "-ab")) == 0) { + a = argv[2]; + b = argv[3]; + options->initialCurveSpecified = TRUE; + options->smallParam = TRUE; + argv += 3; + argc -= 3; + } + else if ((argc > 2) && (strcmp (argv[1], "-nJacobi")) == 0) { + para->nJacobi = atoi(argv[2]); + options->initialCurveSpecified = TRUE; + argv += 2; + argc -= 
2; + } + else if ((argc > 2) && (strcmp (argv[1], "-B1")) == 0) { + B1 = atof(argv[2]); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-B2")) == 0) { + mpz_set_str (B2,argv[2],10); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-seed")) == 0) { + seed = atoi(argv[2]); + argv += 2; + argc -= 2; + } + else if (strcmp (argv[1], "-not_low_param") == 0) { + options->smallParam = FALSE; + argv ++; + argc --; + } + else if (strcmp (argv[1], "-spe") == 0) { + options->initialCurveSpecified = TRUE; + options->curveSpecified = TRUE; + options->nbtests = 1; + argv ++; + argc --; + } + else if (strcmp (argv[1], "-v") == 0) { + options->verbose ++; + argv ++; + argc --; + } + else if ((argc > 2) && (strcmp (argv[1], "-c")) == 0) { + options->nbtests = atoi(argv[2]); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-h_max")) == 0) { + heightMax = argv[2]; + mpz_set_str (options->heightMax ,heightMax ,10); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-h_min")) == 0) { + heightMin = argv[2]; + mpz_set_str (options->heightMin ,heightMin ,10); + argv += 2; + argc -= 2; + } + else { + fprintf(stderr, "Unexpected option: %s\n", argv[1]); + mpz_clear(B2); + exit(1); + } + } + + + + if (seed == 0) { + srand(time(NULL)); + } else { + srand(seed); + } + + + mpz_t N; + mpz_init_set_str (N,number,10); // number in base 10 + gmp_printf ("Input number is %Zd\n",N); + mpmod_init(n,N,ECM_MOD_DEFAULT); + mpz_clear (N); + + + + + if ( mpz_sgn (options->heightMax) ==0 ) { // heightMax = 0 + mpz_set (options->heightMax,n->orig_modulus); // heightMax = n + } + if ( mpz_cmp (options->heightMin ,options->heightMax) > 0 ) { + gmp_printf ("We must have h_min <= h_max. 
Here we have %Zd > %Zd.\n",options->heightMin,options->heightMax); + + optionsHECM_clear (options); + mpmod_clear(n); + mpz_clear (B2); + return 0; + } + + + + + + paraGenCurve_init (para,n); + mpz_init(s); + + + if (options->initialCurveSpecified == TRUE) { + + if (options->smallParam == TRUE) { + mpz_set_str (para->a ,a ,10); + mpz_set_str (para->b ,b ,10); + + if ( mpz_cmp_ui (para->b,0) == 0 ) { // b=0 + printf("We have s=a/b so b must be non zero.\n"); + + optionsHECM_clear (options); + mpz_clear(s); + paraGenCurve_clear (para,n); + mpmod_clear(n); + mpz_clear (B2); + return 0; + } + + mpz_t d; + mpz_init (d); + + mpz_gcd (d ,para->a ,para->b); + mpz_divexact (para->a ,para->a ,d); + mpz_divexact (para->b ,para->b ,d); + + mpz_clear (d); + + if ( ( mpz_cmp (para->a,options->heightMax) > 0 ) || ( mpz_cmp (para->b,options->heightMax) > 0 ) ) { + printf("The height of the initial parameters is too big.\n"); + + optionsHECM_clear (options); + mpz_clear(s); + paraGenCurve_clear (para,n); + mpmod_clear(n); + mpz_clear (B2); + return 0; + } + + + } + else { + mpz_set_str (s,Nsigma,10); + mpres_set_z (para->s,s,n); + + if ( ( mpz_cmp (s,options->heightMax) > 0 ) ) { + printf("The height of the initial parameter is too big.\n"); + + + optionsHECM_clear (options); + mpz_clear(s); + paraGenCurve_clear (para,n); + mpmod_clear(n); + mpz_clear (B2); + return 0; + } + + } + } + else { + if (options->smallParam == TRUE) { + if (mpz_cmp_ui (options->heightMax,2) < 0) { + printf("There is no hyperelliptic curve with parameters smaller than this height.\n"); + + optionsHECM_clear (options); + mpz_clear(s); + paraGenCurve_clear (para,n); + mpmod_clear(n); + mpz_clear (B2); + return 0; + } + + if ( mpz_cmp_ui (options->heightMin,2) < 0 ) { + mpz_set_ui (para->a ,1); + mpz_set_ui (para->b ,2); + } + else { + mpz_set_ui (para->a ,1); + mpz_set (para->b ,options->heightMin); + } + para->nJacobi = 2; + + } + else { + if (mpz_cmp_ui (options->heightMax,3) < 0) { + printf("There is no 
hyperelliptic curve with parameters smaller than this height.\n"); + + optionsHECM_clear (options); + mpz_clear(s); + paraGenCurve_clear (para,n); + mpmod_clear(n); + mpz_clear (B2); + return 0; + } + + para->nJacobi= (rand() % (N_JACOBI_MAX_p1 - N_JACOBI_MIN)) + N_JACOBI_MIN; + // TODO + if ( mpz_cmp_ui (options->heightMin,3) < 0 ) { + mpres_set_ui (para->s ,3 ,n); + } + else { + mpres_set_z (para->s ,options->heightMin ,n); + } + + } + } + + + + + + + + if (options->nbtests <= 0) { + options->nbtests = UINT_MAX; + } + + + + mpres_init (T1.x,n); + mpres_init (T1.y,n); + mpres_init (T1.A,n); + mpres_init (T2.x,n); + mpres_init (T2.y,n); + mpres_init (T2.A,n); + + mpz_init(f); + + + + mpz_init(k); + prodTreeCalculk(k,B1); // k=lcm(2,3,..,B1) + getprime_clear (); + + + + + + + + if ( ECM_IS_DEFAULT_B2(B2) ) { + mpz_set_d (B2, B2scale * pow (ECM_COST * B1, DEFAULT_B2_EXPONENT)); + gmp_printf ("Using B1=%lu and B2=%Zd\n",(long) B1,B2); + } + + + + + + + // loop on hecm + for(nb=1 ; nb <= options->nbtests ; nb++) { + printf("Run %u on %u\n",nb,options->nbtests); + + + + // stage 1 + if ( options->smallParam == TRUE) { + test = hecm1LowParam (f ,n ,k ,para ,&T1,&T2 ,options); + } + else { + test = hecm1Normal (f ,n ,k ,para ,&T1,&T2 ,options); + } + + if ( options->verbose >= OUTPUT_VERBOSE ) { + // we need the parameters of the curve. 
+ printf ("The curve is given by "); + printCurve (para,options,n); + } + + if (test != HECM_NO_FACTOR_FOUND) { + break; + } + + + + + // stage 2 + test = hecm2 (f,n,&T1,&T2,B2); + if (test != HECM_NO_FACTOR_FOUND) { + break; + } + + + + // find parameters for the next curve + test2 = nextParam (f,n,para,options); + if (test2 == NEXT_SMALL_PARAM_TOO_BIG) { + test = HECM_PARAM_TOO_BIG; + break; + } + else if (test2 == NEXT_PARAM_ERROR) { + test = HECM_ERROR; + break; + } + + } + + + + + if (nb>options->nbtests) { + nb = options->nbtests; + } + + if ( ( para->nJacobi == 0 ) || ( para->nJacobi == 1 ) ) { + para->nJacobi = 2; + } + + + + if (test == HECM_ERROR) { + printf("An error occured with the curve:\t" ); + printCurve(para,options,n); + printf("It could be a multiplication by 0 in the projective space.\n" ); + } + else if (test == HECM_GENERATION_FAIL) { + printf("The generation of the curve failled. The parameters were:\t"); + printCurve(para,options,n); + } + else if (test == HECM_NO_FACTOR_FOUND) { + printf("We test %u hyperelliptic curves (i.e. %u elliptic curves)\n",options->nbtests,options->nbtests*2); + printf("without finding factor of n.\n" ); + } + else if (test == HECM_PARAM_TOO_BIG) { + printf("There is no more curves with low parameters height.\n"); + printf("We test %u hyperelliptic curves (i.e. %u elliptic curves)\n",nb,nb*2); + printf("without finding factor of n.\n" ); + } + else if (test == HECM_FOUND_N ) { + printf("We divided by n\n"); + printf("We are on the curve of parameters:\t"); + printCurve(para,options,n); + printf("We test %u hyperelliptic curves (i.e. 
%u elliptic curves)\n",nb,nb*2); + } + else if (test == HECM_FOUND_ZERO_CURVE_1 ) { + //TODO in that case, in general, we haven't check if we had the zero of the second curve + printf ("k*P is the zero of the first elliptic curve modulo all the factors of n\n"); + printf("We are on the hyperelliptic curve of parameters:\t"); + printCurve(para,options,n); + printf("We test %u hyperelliptic curves (i.e. %u elliptic curves)\n",nb,nb*2); + } + else if (test == HECM_FOUND_ZERO_CURVE_2 ) { + printf ("k*P is the zero of the second elliptic curve modulo all the factors of n\n"); + printf("We are on the hyperelliptic curve of parameters:\n"); + printf("\t"); + printCurve(para,options,n); + printf("We test %u hyperelliptic curves (i.e. %u elliptic curves)\n",nb,nb*2); + } + else if (test == HECM_FOUND_ZERO_CURVE_1_AND_2 ) { + printf ("k*P is the zero of both elliptic curve modulo all the factors of n\n"); + printf("We are on the hyperelliptic curve of parameters:\t"); + printCurve(para,options,n); + printf("We test %u hyperelliptic curves (i.e. %u elliptic curves)\n",nb,nb*2); + } + else { // We found a real factor of n + gmp_printf("A factor of n is %Zd\n",f); + if ( options->verbose >= OUTPUT_VERBOSE ) { + if (test == HECM_FACTOR_FOUND_MORPHISM) { + printf("It was found during the computation of the morphisms.\n"); + } + else if (test == HECM_FACTOR_FOUND_GENERATION) { + printf("It was found during the generation of the curve.\n"); + } + else if (test == HECM_FACTOR_FOUND_STEP1) { + printf("It was found in stage 1.\n"); + } + else if (test == HECM_FACTOR_FOUND_STEP2) { + printf("It was found in stage 2.\n"); + } + } + printf("It was found by using the parameters:\t"); + printCurve(para,options,n); + printf("We used %u hyperelliptic curves (i.e. 
%u elliptic curves).\n",nb,nb*2); + + } + + + optionsHECM_clear (options); + mpz_clear(s); + mpz_clear(k); + mpz_clear(f); + paraGenCurve_clear (para,n); + mpres_clear (T1.x,n); + mpres_clear (T1.y,n); + mpres_clear (T1.A,n); + mpres_clear (T2.x,n); + mpres_clear (T2.y,n); + mpres_clear (T2.A,n); + mpmod_clear(n); + mpz_clear (B2); + + temps_final = clock (); + temps_cpu = (temps_final - temps_initial); + printf("The program took %g seconds.\n",temps_cpu/CLOCKS_PER_SEC); + + + return 1; +} diff -Nru gmp-ecm-7.0.4+ds/hecm/hecm.h gmp-ecm-7.0.5+ds/hecm/hecm.h --- gmp-ecm-7.0.4+ds/hecm/hecm.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/hecm.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,49 @@ +#ifndef _HECM_H +#define _HECM_H + +#include "../ecm-impl.h" + +#define HECM_FOUND_N -2 +#define HECM_ERROR -1 +#define HECM_NO_FACTOR_FOUND 0 +#define HECM_FACTOR_FOUND_STEP1 1 +#define HECM_FACTOR_FOUND_STEP2 2 +#define HECM_FACTOR_FOUND_GENERATION 3 +#define HECM_FACTOR_FOUND_MORPHISM 4 +#define HECM_FOUND_ZERO_CURVE_1 5 +#define HECM_FOUND_ZERO_CURVE_2 6 +#define HECM_FOUND_ZERO_CURVE_1_AND_2 7 +#define HECM_GENERATION_FAIL 8 +#define HECM_PARAM_TOO_BIG 9 + +#define TRUE 1 +#define FALSE 0 + + +struct optionsHECM_s { + int smallParam; // use small parameters (default = TRUE) + int curveSpecified; // use a specified curve + int initialCurveSpecified; // the first curve is specified + int verbose; // set the verbose mode + unsigned int nbtests; // number of tests to do + mpz_t heightMin; // minimal height for the parameters + mpz_t heightMax; // maximal height for the parameters +}; +typedef struct optionsHECM_s optionsHECM[1]; + +void optionsHECM_init (optionsHECM options); +void optionsHECM_clear (optionsHECM options); + +#include "generation.h" + +/* stage 1 */ +int hecm1Normal (mpz_t f ,mpmod_t n ,mpz_t k ,paraGenCurve para ,curve *T1,curve *T2,optionsHECM options); +int hecm1LowParam (mpz_t f ,mpmod_t n ,mpz_t k ,paraGenCurve para ,curve *T1,curve 
*T2,optionsHECM options); + +/* stage 2 */ +int ecmfactor2 (mpz_t f, mpz_t n, mpz_t A, mpz_t x, mpz_t y, mpz_t B2); +int hecm2 (mpz_t f, mpmod_t n, curve* T1, curve* T2, mpz_t B2); + + + +#endif diff -Nru gmp-ecm-7.0.4+ds/hecm/Jacobi.c gmp-ecm-7.0.5+ds/hecm/Jacobi.c --- gmp-ecm-7.0.4+ds/hecm/Jacobi.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/Jacobi.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,460 @@ +#include "Jacobi.h" + + + + + +void coorJacobi_init (coorJacobi P, mpmod_t n ) { + mpres_init (P->U ,n); + mpres_init (P->V ,n); + mpres_init (P->W ,n); + mpres_init (P->Y ,n); +} + +void coorJacobi_clear (coorJacobi P, mpmod_t n ) { + mpres_clear (P->U ,n); + mpres_clear (P->V ,n); + mpres_clear (P->W ,n); + mpres_clear (P->Y ,n); +} + +void coorJacobi_set (coorJacobi P,coorJacobi Q, mpmod_t n ) { + mpres_set (P->U ,Q->U ,n); + mpres_set (P->V ,Q->V ,n); + mpres_set (P->W ,Q->W ,n); + mpres_set (P->Y ,Q->Y ,n); +} + + + +/* + Multiply a point by k on a Jacobi elliptic curve over ZZ (use mpz_t) + The curve is y^2 = 1 + (-3/s^2+1/s^4)*x^2 + x^4/s^2 + The initial point is X=1, Y=1-ep (Z=1) + The result is put in (x,y,z) + NOTE: for k=0 or k=1 we take k=2! 
+*/ +void mulJacobiEntiers (mpz_t a,mpz_t b, + int k, + mpz_t x,mpz_t y,mpz_t z) { + + + if (k<0) { + k = -k; + } + + if (k==0) { + mpz_set_ui (x,0); + mpz_set_ui (y,1); + mpz_set_ui (z,1); + } + else if (k==1) { + mpz_mul (x ,a ,a); + mpz_mul (z ,b ,b); + mpz_sub (y ,x ,z); + + mpz_set (x,a); + mpz_set (z,a); + + } + else if (k ==2) { + mpz_mul (y ,b ,b); + mpz_mul_ui (y ,y ,2); + mpz_mul (z ,a ,a); + mpz_add (y ,y ,z); // y = a^2 + 2*b^2 + mpz_mul_ui (x ,a ,2); // x = 2*a + mpz_set (z ,a); // z = a + + + } + else { // general case + mpz_t sa,sb,Yi; + mpz_t X,Y,Z; + mpz_t t1,t2,t3,t4,t5,t7,t8,t9; + int i; + + mpz_init (sa); + mpz_init (sb); + mpz_init (Yi); + mpz_init (X); + mpz_init (Z); + mpz_init (Y); + mpz_init (t1); + mpz_init (t2); + mpz_init (t3); + mpz_init (t4); + mpz_init (t5); + mpz_init (t7); + mpz_init (t8); + mpz_init (t9); + + mpz_mul (sa ,a ,a); + mpz_mul (sb ,b ,b); + mpz_sub (Yi ,sa ,sb); + + mpz_set (X ,a); + mpz_set (Z ,a); + mpz_set (Y ,Yi); + + for (i=2;i<=k;i++) { + mpz_mul (t1 ,X ,Z); + mpz_mul (t2 ,X ,X); + mpz_mul (t3 ,Z ,Z); + mpz_mul (t4 ,t3 ,sa); + mpz_mul (t5 ,t2 ,sb); + + + mpz_add (t9 ,t2 ,t3); + mpz_mul (t9 ,t9 ,sa); + mpz_mul (t9 ,t9 ,sb); + mpz_mul (t9 ,t9 ,t1); + mpz_mul_ui (t9 ,t9 ,2); + mpz_mul (t9 ,t9 ,sa); + + mpz_mul (t8 ,Yi ,Y); + mpz_mul (t8 ,t8 ,sa); + + mpz_mul_ui (t7 ,sa ,3); + mpz_sub (t7 ,sb ,t7); + mpz_mul (t7 ,t7 ,sb); + mpz_mul (t7 ,t7 ,t1); + + mpz_add (t8 ,t8 ,t7); + + mpz_add (t7 ,t4 ,t5); + mpz_mul (t8 ,t8 ,t7); + mpz_add (t9 ,t9 ,t8); + + + mpz_sub (t8 ,t4 ,t5); + mpz_mul (t8 ,t8 ,a); + + + mpz_mul (t7 ,t1 ,Yi); + mpz_mul (t2 ,sa ,Y); + mpz_add (t7 ,t7 ,t2); + mpz_mul (t7 ,t7 ,a); + + mpz_set (X ,t7); + mpz_set (Y ,t9); + mpz_set (Z ,t8); + } + mpz_set (x ,X); + mpz_set (y ,Y); + mpz_set (z ,Z); + + + mpz_clear (sa); + mpz_clear (sb); + mpz_clear (Yi); + mpz_clear (X); + mpz_clear (Z); + mpz_clear (Y); + mpz_clear (t1); + mpz_clear (t2); + mpz_clear (t3); + mpz_clear (t4); + mpz_clear (t5); + mpz_clear (t7); 
+ mpz_clear (t8); + mpz_clear (t9); + } + + +} + + + + +/* + Multiply a point by k on a Jacobi elliptic curve + The curve is y^2 = 1 + (-3/s^2+1/s^4)*x^2 + x^4/s^2 + The initial point is X=1, Y=1-ep (Z=1) + The result is put in x,y + NOTE: for k=0 or k=1 we take k=2! +*/ +int mulJacobi2 (mpz_t f,mpmod_t n, + int k, + mpres_t x,mpres_t y, + mpres_t Y,mpres_t ep,mpres_t dep) { + + + if (k <= 2) { + + doubleJacobi2DebFin(f,n,x,y,Y,ep,dep); + return MULT_JACOBI; + + } + else if (k == 3) { + + coorJacobi P,Q; + mpres_t t; + + mpres_init (t,n); + coorJacobi_init (P,n); + coorJacobi_init (Q,n); + + mpres_set_ui (P->U,1,n); + mpres_set_ui (P->V,1,n); + mpres_set_ui (P->W,1,n); + mpres_set (P->Y,Y,n); + + doubleJacobi2Deb(n,Q,Y,ep,dep); + addJacobi2fin(n,Q,P,x,y,t,ep,dep); + + if (!mpres_invert (t,t,n)) // t=1/Z3 + { + mpres_gcd (f, t, n); + + mpres_clear (t,n); + coorJacobi_clear (P,n); + coorJacobi_clear (Q,n); + + return MULT_JACOBI_FAIL; + } + + mpres_mul (x ,x ,t ,n); + mpres_mul (t ,t ,t ,n); + mpres_mul (y ,y ,t ,n); + + mpres_clear (t,n); + coorJacobi_clear (P,n); + coorJacobi_clear (Q,n); + + return MULT_JACOBI; + + } + else { + + coorJacobi P,Q; + mpres_t t; + int mask; + + mpres_init (t,n); + coorJacobi_init (P,n); + coorJacobi_init (Q,n); + + mpres_set_ui (P->U,1,n); + mpres_set_ui (P->V,1,n); + mpres_set_ui (P->W,1,n); + mpres_set (P->Y,Y,n); + + + mask = (1<> 1); + } // mask = 100... with the same number of bits than k + mask = (mask >> 1);// mask = 010... with the same number of bits than k + + + doubleJacobi2Deb(n,Q,Y,ep,dep); // Q = 2*P + if ( (mask & k) != 0 ) { // case Q+2*Q + addJacobi2(n,P,Q,Q,ep,dep); + } + mask = (mask >> 1);// mask = 0010... 
with the same number of bits than k + + while (mask > 1) { + addJacobi2(n,Q,Q,Q,ep,dep); + if ( (mask & k) != 0 ) { // case Q+2*Q + addJacobi2(n,P,Q,Q,ep,dep); + } + mask = (mask >> 1); + } + + + if ( (mask & k) == 0 ) { // case 2*Q + coorJacobi_set (P ,Q ,n); + addJacobi2fin(n,Q,P,x,y,t,ep,dep); + } + else { // case 2*Q+Q + addJacobi2(n,Q,Q,Q,ep,dep); + addJacobi2fin(n,Q,P,x,y,t,ep,dep); + } + + + if (!mpres_invert (t,t,n)) // t=1/Z3 + { + mpres_gcd (f, t, n); + + mpres_clear (t,n); + coorJacobi_clear (P,n); + coorJacobi_clear (Q,n); + + return MULT_JACOBI_FAIL; + } + + mpres_mul (x ,x ,t ,n); + mpres_mul (t ,t ,t ,n); + mpres_mul (y ,y ,t ,n); + + mpres_clear (t,n); + coorJacobi_clear (P,n); + coorJacobi_clear (Q,n); + + return MULT_JACOBI; + + } + +} + + + + +/* + Double a point on the Jacobi curve y^2=1-dep*X^2+ep*X^4 + begin with X=1, Y=1-ep (Z=1) + We want x,y +*/ +void doubleJacobi2DebFin(mpz_t f,mpmod_t n, + mpres_t x,mpres_t y, + mpres_t Y,mpres_t ep,mpres_t dep) { + + + + + mpres_set_ui (x ,2 ,n); // x=2 + + mpres_mul_ui (y ,ep ,2 ,n); + mpres_add_ui (y ,y ,1 ,n); // y = 2*ep+1 = 2/s^2+1 + + +} + + + + +/* + Double a point on the Jacobi curve y^2=1-dep*X^2+ep*X^4 + begin with X=1, Y=1-ep (Z=1) + We want P2=(U3,V3,W3,Y3) +*/ +void doubleJacobi2Deb(mpmod_t n, + coorJacobi P2, + mpres_t Y,mpres_t ep,mpres_t dep) { + + mpres_add_ui (P2->V ,ep, 1 ,n); // V3 = ep+1 + mpres_mul (P2->W ,Y ,Y ,n); // W3 = Y^2 + mpres_sub (P2->Y ,P2->W ,dep ,n); // Y3=Y^2-dep + mpres_mul (P2->Y ,P2->Y ,P2->V ,n); // Y3=(1+ep)*(Y^2-dep) + mpres_mul_ui (P2->V ,ep ,4 ,n); + mpres_add (P2->Y ,P2->Y ,P2->V ,n); // Y3=(1+ep)*(Y^2-dep) + 4ep + + mpres_mul_ui (P2->V ,P2->W ,2 ,n); // V3 = 2*Y^2 + mpres_mul_ui (P2->U ,P2->V ,2 ,n); // U3 = 4*Y^2 + +} + + + +/* + add two points on the jacobi curve y^2=1-dep*X^2+ep*X^4 + Initial points P1=(U1,V1,W1,Y1) and P2=(U2,V2,W2,Y2) + P1+P2=P3=(U3,V3,W3,Y3) +*/ +void addJacobi2(mpmod_t n, + coorJacobi P1,coorJacobi P2, + coorJacobi P3, + mpres_t 
ep,mpres_t dep) { + + mpres_t t1,t3,t5,t7,t9; + + mpres_init (t1,n); + mpres_init (t3,n); + mpres_init (t5,n); + mpres_init (t7,n); + mpres_init (t9,n); + + + + mpres_set (t1 ,P1->U ,n); + mpres_set (P3->U ,P2->U ,n); + mpres_set (t3 ,P1->V ,n); + mpres_set (P3->V ,P2->V ,n); + mpres_set (t5 ,P1->W ,n); + mpres_set (P3->W ,P2->W ,n); + mpres_set (t7 ,P1->Y ,n); + mpres_set (P3->Y ,P2->Y ,n); + + + mpres_mul (t9 ,t7 ,P3->Y ,n); + mpres_add (t7 ,t7 ,t3 ,n); + mpres_add (P3->Y ,P3->Y ,P3->V ,n); + mpres_mul (t3 ,t3 ,P3->V ,n); + mpres_mul (t7 ,t7 ,P3->Y ,n); + mpres_sub (t7 ,t7 ,t9 ,n); + mpres_sub (t7 ,t7 ,t3 ,n); // X3 + mpres_mul (P3->V ,t1 ,P3->U ,n); + mpres_mul (P3->Y ,t5 ,P3->W ,n); + mpres_add (t1 ,t1 ,t5 ,n); + mpres_add (P3->U ,P3->U ,P3->W ,n); + mpres_mul (t5 ,t1 ,P3->U ,n); + mpres_sub (t5 ,t5 ,P3->V ,n); + mpres_sub (t5 ,t5 ,P3->Y ,n); + mpres_mul (P3->V ,P3->V ,ep ,n); + mpres_sub (t1 ,P3->Y ,P3->V ,n); // Z3 + mpres_add (P3->U ,P3->Y ,P3->V ,n); + mpres_mul (P3->W ,t3 ,dep ,n); + mpres_sub (P3->W ,t9 ,P3->W ,n); + mpres_mul (P3->W ,P3->W ,P3->U ,n); + mpres_mul_ui (t3 ,t3 ,2 ,n); + mpres_mul (t3 ,t3 ,ep ,n); + mpres_mul (t3 ,t3 ,t5 ,n); + mpres_add (P3->Y ,P3->W ,t3 ,n); // Y3 + + + + mpres_mul (P3->U ,t7 ,t7 ,n); // U3 + mpres_mul (P3->V ,t1 ,t7 ,n); // V3 + mpres_mul (P3->W ,t1 ,t1 ,n); // W3 + + + mpres_clear (t1,n); + mpres_clear (t3,n); + mpres_clear (t5,n); + mpres_clear (t7,n); + mpres_clear (t9,n); +} + + + +/* + add two points on the jacobi curve y^2=1-dep*X^2+ep*X^4 + Initial points P1=(U1,V1,W1,Y1) and P2=(U2,V2,W2,Y2) + P1+P2=P3=(U3,V3,W3,Y3) + WARNING: value of P1 and P2 are modified + P1 and P2 must be different +*/ +void addJacobi2fin(mpmod_t n, + coorJacobi P1,coorJacobi P2, + mpres_t X3,mpres_t Y3,mpres_t Z3, + mpres_t ep,mpres_t dep) { + + mpres_mul (Y3 ,P1->Y ,P2->Y ,n); + mpres_add (P1->Y ,P1->Y ,P1->V ,n); + mpres_add (P2->Y ,P2->Y ,P2->V ,n); + mpres_mul (P1->V ,P1->V ,P2->V ,n); + mpres_mul (P1->Y ,P1->Y ,P2->Y ,n); + mpres_sub 
(P1->Y ,P1->Y ,Y3 ,n); + mpres_sub (X3 ,P1->Y ,P1->V ,n); // X3 + mpres_mul (P2->V ,P1->U ,P2->U ,n); + mpres_mul (P2->Y ,P1->W ,P2->W ,n); + mpres_add (P1->U ,P1->U ,P1->W ,n); + mpres_add (P2->U ,P2->U ,P2->W ,n); + mpres_mul (P1->W ,P1->U ,P2->U ,n); + mpres_sub (P1->W ,P1->W ,P2->V ,n); + mpres_sub (P1->W ,P1->W ,P2->Y ,n); + mpres_mul (P2->V ,P2->V ,ep ,n); + mpres_sub (Z3 ,P2->Y ,P2->V ,n); // Z3 + mpres_add (P2->U ,P2->Y ,P2->V ,n); + mpres_mul (P2->W ,P1->V ,dep ,n); + mpres_sub (P2->W ,Y3 ,P2->W ,n); + mpres_mul (P2->W ,P2->W ,P2->U ,n); + + mpres_mul_ui (P1->V ,P1->V ,2 ,n); + + mpres_mul (P1->V ,P1->V ,ep ,n); + mpres_mul (P1->V ,P1->V ,P1->W ,n); + mpres_add (Y3 ,P2->W ,P1->V ,n); // Y3 +} + + diff -Nru gmp-ecm-7.0.4+ds/hecm/Jacobi.h gmp-ecm-7.0.5+ds/hecm/Jacobi.h --- gmp-ecm-7.0.4+ds/hecm/Jacobi.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/Jacobi.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,44 @@ +#ifndef _JACOBI_H +#define _JACOBI_H + +#include "../ecm-impl.h" +#include "generation.h" +#include "auxi.h" + +#include + + +#define MULT_JACOBI_FAIL 0 // The multiplication on the Jacobi curve failled +#define MULT_JACOBI 1 // The multiplication on the Jacobi curve is a success + + + +struct coorJacobi_s { + mpres_t U; + mpres_t V; + mpres_t W; + mpres_t Y; +}; +typedef struct coorJacobi_s coorJacobi[1]; + +void coorJacobi_init (coorJacobi P, mpmod_t n ); +void coorJacobi_clear (coorJacobi P, mpmod_t n ); +void coorJacobi_set (coorJacobi P,coorJacobi Q, mpmod_t n ); + + +void mulJacobiEntiers (mpz_t a,mpz_t b,int k,mpz_t x,mpz_t y,mpz_t z); + + +int mulJacobi2 (mpz_t f,mpmod_t n,int k,mpres_t x,mpres_t y,mpres_t Y,mpres_t ep,mpres_t dep); + + +void doubleJacobi2DebFin(mpz_t f,mpmod_t n,mpres_t x,mpres_t y,mpres_t Y,mpres_t ep,mpres_t dep); + + +void doubleJacobi2Deb(mpmod_t n,coorJacobi P2,mpres_t Y,mpres_t ep,mpres_t dep); + +void addJacobi2(mpmod_t n,coorJacobi P1,coorJacobi P2,coorJacobi P3,mpres_t ep,mpres_t dep); + +void 
addJacobi2fin(mpmod_t n,coorJacobi P1,coorJacobi P2,mpres_t X3,mpres_t Y3,mpres_t Z3,mpres_t ep,mpres_t dep); + +#endif diff -Nru gmp-ecm-7.0.4+ds/hecm/Makefile gmp-ecm-7.0.5+ds/hecm/Makefile --- gmp-ecm-7.0.4+ds/hecm/Makefile 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/Makefile 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,24 @@ + + +CC=gcc +CFLAGS=-O2 -g -Wall -std=c99 + + + + +hecm: auxi.o morphismes.o Jacobi.o generation.o ariKS.o hecm.o stage1HECM.o stage2HECM.o + gcc auxi.o stage1HECM.o morphismes.o Jacobi.o generation.o ariKS.o hecm.o stage2HECM.o ../libecm_la-mpmod.o ../libecm_la-getprime.o ../libecm_la-mul_fft.o ../libecm_la-auxlib.o ../libecm_la-mul_lo.o -o hecm ../.libs/libecm.a -lgmp -lm ../x86_64/.libs/libmulredc.a + +generation.o: ../ecm-impl.h generation.h ariKS.h hecm.h +ariKS.o: ../ecm-impl.h ariKS.h generation.h +auxi.o: auxi.h +morphimes.o: ../ecm-impl.h generation.h ariKS.h morphismes.h Jacobi.h auxi.h +stage1HECM.o: ../ecm-impl.h morphismes.h ariKS.h generation.h hecm.h +stage2HECM.o: ../ecm-impl.h hecm.h +hecm.o: ../ecm-impl.h morphismes.h generation.h auxi.h hecm.h +Jacobi.o: ../ecm-impl.h generation.h Jacobi.h + + + +clean: + rm -f *.o *~ *.bak hecm diff -Nru gmp-ecm-7.0.4+ds/hecm/morphismes.c gmp-ecm-7.0.5+ds/hecm/morphismes.c --- gmp-ecm-7.0.4+ds/hecm/morphismes.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/morphismes.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,2253 @@ +#include "morphismes.h" +#include "auxi.h" +#include "ariKS.h" +#include "Jacobi.h" + + +void DivMumfordU_init (DivMumfordU DivU, mpmod_t n ) { + mpres_init (DivU->u0,n); + mpres_init (DivU->u1,n); + DivU->degree = 0; +} + + +void DivMumfordU_clear (DivMumfordU DivU, mpmod_t n ) { + mpres_clear (DivU->u0,n); + mpres_clear (DivU->u1,n); +} + + +void DivMumfordV_init (DivMumfordV DivV, mpmod_t n ) { + mpres_init (DivV->V0,n); + mpres_init (DivV->V1,n); + mpres_init (DivV->v1v0,n); +} + +void DivMumfordV_clear (DivMumfordV DivV, 
mpmod_t n ) { + mpres_clear (DivV->V0,n); + mpres_clear (DivV->V1,n); + mpres_clear (DivV->v1v0,n); +} + + + +/* + Let P be a point on the Kummer surface + We want to obtain the Mumford polynomial of the corresponding divisor + A divisor and its opposite have the same image on the Kummer surface + Thus we can't have the exact v polynomial of P + Instead we obtain V=v^2 + There is 3 cases + if degree u=0 then we have the divisor 0 + if degree u=1 we only need the polynomial u for the following + if degree u=2 we use the function DivMumfordDegree2 +*/ +int DivMumford (mpz_t f,mpmod_t n,DivMumfordU divU,DivMumfordV divV, + ksPoint P,thetaCst th,curveHyperEll cHEll) { + + mpres_t t1,t2; + mpres_t T13p, T14p,T16p; + int test; + + mpres_init (t1,n); + mpres_init (t2,n); + + mpres_init (T13p,n); + mpres_init (T14p,n); + mpres_init (T16p,n); + + // Construction of 3 theta functions up to constants + // T13p := X*Rac - Y*t10p*Rac + Z*t7p - T*t7p*t10p; + // T14p := X*t5p*Rac - Y*t6p*t7p + Z*t5p*t7p - T*t6p*Rac; + // T16p := X*t6p*t7p + Y*t5p*Rac - Z*t6p*Rac + T*t5p*t7p; + mpres_mul (T13p ,P->Z ,th->t7p ,n); // Z*t7p + mpres_mul (T14p ,T13p ,th->t5p ,n); // Z*t5p*t7p + mpres_mul (t1 ,P->Y ,th->Rac ,n); + mpres_mul (T16p ,th->t5p ,t1 ,n); // Y*t5p*Rac + mpres_mul (t1 ,t1 ,th->t10p ,n); + mpres_sub (T13p ,T13p ,t1 ,n); // Z*t7p - Y*t10p*Rac + mpres_mul (t1 ,P->T ,th->t7p ,n); + mpres_mul (t2 ,t1 ,th->t10p ,n); + mpres_sub (T13p ,T13p ,t2 ,n); // Z*t7p - Y*t10p*Rac - T*t7p*t10p + mpres_mul (t2 ,t1 ,th->t5p ,n); + mpres_add (T16p ,T16p ,t2 ,n); // T*t5p*t7p + Y*t5p*Rac + mpres_mul (t1 ,P->X ,th->Rac ,n); + mpres_add (T13p ,T13p ,t1 ,n); // Z*t7p - Y*t10p*Rac - T*t7p*t10p +X*Rac + mpres_mul (t2 ,t1 ,th->t5p ,n); + mpres_add (T14p ,T14p ,t2 ,n); // Z*t5p*t7p + X*t5p*Rac + mpres_mul (t1 ,th->t6p ,th->Rac ,n); + mpres_mul (t2 ,t1 ,P->T ,n); + mpres_sub (T14p ,T14p ,t2 ,n); // Z*t5p*t7p + X*t5p*Rac - T*t6p*Rac + mpres_mul (t2 ,t1 ,P->Z ,n); + mpres_sub (T16p ,T16p ,t2 ,n); // 
T*t5p*t7p + Y*t5p*Rac - Z*t6p*Rac + mpres_mul (t1 ,th->t6p ,th->t7p ,n); + mpres_mul (t2 ,t1 ,P->Y ,n); + mpres_sub (T14p ,T14p ,t2 ,n);// Z*t5p*t7p + X*t5p*Rac - T*t6p*Rac - Y*t6p*t7p + mpres_mul (t2 ,t1 ,P->X ,n); + mpres_sub (T16p ,T16p ,t2 ,n);// T*t5p*t7p + Y*t5p*Rac - Z*t6p*Rac + X*t6p*t7p + // We have finished T13p,T14p et T16p + // Note that contrary to the papper, we don't divided by t7p^2-Rac^2 + + + + if ( !mpres_is_zero(T16p,n) ) { // T16p != 0 + + divU->degree =2; + + if (!mpres_invert (T16p , T16p, n)) // T16p=1/T16p + { + mpres_gcd (f, T16p, n); + + + mpres_clear (t1,n); + mpres_clear (t2,n); + + mpres_clear (T13p,n); + mpres_clear (T14p,n); + mpres_clear (T16p,n); + return MORPHISM_FAIL; + } + + + + test=DivMumfordDegree2(f,n,divU,divV,P,th,cHEll,T13p,T14p, T16p); + + mpres_clear (t1,n); + mpres_clear (t2,n); + + mpres_clear (T13p,n); + mpres_clear (T14p,n); + mpres_clear (T16p,n); + + return test; + + } + else { // T16p = 0 + + /* + denom := (la-1)*t5p*T13p - la*t8p*T14p; + u0 := la*T14p / denom; + */ + + + mpres_sub_ui (t1 ,cHEll->la ,1 ,n); + mpres_mul (t1 ,t1 ,th->t5p ,n); + mpres_mul (t1 ,t1 ,T13p ,n); // (la-1)*t5p*T13p + mpres_mul (t2 ,cHEll->la ,T14p ,n); // la*t8p*t14p; + mpres_sub (t1, t1 ,t2 ,n); // t1 = denom = (la-1)*t5p*T13p - la*t8p*T14p + + + if ( !mpres_is_zero(t1,n) ) { // denom != 0 + + if (!mpres_invert (t2 , t1, n)) // t2=1/( (la-1)*t5p*T13p - la*t8p*t14p ) + { + mpres_gcd (f, t1, n); + + mpres_clear (t1,n); + mpres_clear (t2,n); + + mpres_clear (T13p,n); + mpres_clear (T14p,n); + mpres_clear (T16p,n); + + return MORPHISM_FAIL; + } + + divU->degree =1; + + mpres_mul (divU->u0 ,cHEll->la ,t2 ,n); + mpres_mul (divU->u0 ,divU->u0 ,T14p ,n); + + // we don't need V. 
+ + mpres_clear (t1,n); + mpres_clear (t2,n); + + mpres_clear (T13p,n); + mpres_clear (T14p,n); + mpres_clear (T16p,n); + + return MORPHISM; + + } + else { // denom = 0 + + divU->degree =0; + + mpres_clear (t1,n); + mpres_clear (t2,n); + + mpres_clear (T13p,n); + mpres_clear (T14p,n); + mpres_clear (T16p,n); + + return MORPHISM; + + } + + } + +} + + + + + + + + + + +/* + Get the Mumford polynomial in the case of degree u=2. + We only get V=v^2 +*/ +int DivMumfordDegree2 (mpz_t f,mpmod_t n,DivMumfordU divU,DivMumfordV divV, + ksPoint P,thetaCst th,curveHyperEll cHEll, + mpres_t T13p, mpres_t T14p, mpres_t T16p) { + + mpres_t t1,t2,t3,t4,t5; + mpres_t T7p,T9p,T11p,T12p; + + mpres_init (t1,n); + mpres_init (t2,n); + mpres_init (t3,n); + mpres_init (t4,n); + mpres_init (t5,n); + + mpres_init (T7p,n); + mpres_init (T9p,n); + mpres_init (T11p,n); + mpres_init (T12p,n); + + /* +T7p := ( - X*t5p + Y*t5p*t10p - Z*t6p*t10p + T*t6p ) / (t6p^2-t5p^2); +T9p := ( X*t6p*t10p - Y*t6p + Z*t5p - T*t5p*t10p ) / (t8p^2-t10p^2); +T11p:= ( X*t6p - Y*t6p*t10p + Z*t5p*t10p - T*t5p ) / (t6p^2-t5p^2); +T12p:= ( - X*t5p*t10p + Y*t5p - Z*t6p + T*t6p*t10p ) / (t8p^2-t10p^2); + */ + + mpres_mul (T7p ,P->T ,th->t6p ,n); // T*t6p + mpres_mul (T12p ,T7p ,th->t10p ,n); // T*t6p*t10p + mpres_mul (t1 ,P->Y ,th->t5p ,n); + mpres_add (T12p ,T12p ,t1 ,n); // T*t6p*t10p + Y*t5p + mpres_mul (t1 ,t1 ,th->t10p ,n); + mpres_add (T7p ,T7p ,t1 ,n); // T*t6p + Y*t5p*t10p + mpres_mul (t1 ,P->Z ,th->t6p ,n); + mpres_sub (T12p ,T12p ,t1 ,n); // T*t6p*t10p + Y*t5p - Z*t6p + mpres_mul (t1 ,t1 ,th->t10p ,n); + mpres_sub (T7p ,T7p ,t1 ,n); // T*t6p + Y*t5p*t10p - Z*t6p*t10p + mpres_mul (t1 ,P->X ,th->t5p ,n); + mpres_sub (T7p ,T7p ,t1 ,n); // T*t6p + Y*t5p*t10p - Z*t6p*t10p - X*t5p + mpres_mul (t1 ,t1 ,th->t10p ,n); + mpres_sub (T12p ,T12p ,t1 ,n); // T*t6p*t10p + Y*t5p - Z*t6p - X*t5p*t10p + // We finished the numerators of T7p and T12p + + mpres_mul (T9p ,P->Z ,th->t5p ,n); // Z*t5p + mpres_mul (T11p ,T9p ,th->t10p 
,n); // Z*t5p*t10p + mpres_mul (t1 ,P->X ,th->t6p ,n); + mpres_add (T11p ,T11p ,t1 ,n); // Z*t5p*t10p + X*t6p + mpres_mul (t1 ,t1 ,th->t10p ,n); + mpres_add (T9p ,T9p ,t1 ,n); // Z*t5p + X*t6p*t10p + mpres_mul (t1 ,P->T ,th->t5p ,n); + mpres_sub (T11p ,T11p ,t1 ,n); // Z*t5p*t10p + X*t6p - T*t5p + mpres_mul (t1 ,t1 ,th->t10p ,n); + mpres_sub (T9p ,T9p ,t1 ,n); // Z*t5p + X*t6p*t10p - T*t5p*t10p + mpres_mul (t1 ,P->Y ,th->t6p ,n); + mpres_sub (T9p ,T9p ,t1 ,n); // Z*t5p + X*t6p*t10p - T*t5p*t10p - Y*t6p + mpres_mul (t1 ,t1 ,th->t10p ,n); + mpres_sub (T11p ,T11p ,t1 ,n); // Z*t5p*t10p + X*t6p - T*t5p - Y*t6p*t10p + // We finished the numerators of T9p and T11p + + + + mpres_mul (t1, th->t6p ,th->t6p ,n); + mpres_mul (t2 ,th->t5p ,th->t5p ,n); + mpres_sub (t2 ,t1 ,t2 ,n); // t6p^2-t5p^2 + if (!mpres_invert (t1 , t2, n)) // t1=1 / (t6p^2-t5p^2) + { + mpres_gcd (f, t2, n); + + mpres_clear (t1, n); + mpres_clear (t2, n); + mpres_clear (t3, n); + mpres_clear (t4, n); + mpres_clear (t5, n); + + mpres_clear (T7p,n); + mpres_clear (T9p,n); + mpres_clear (T11p,n); + mpres_clear (T12p,n); + + return MORPHISM_FAIL; + } + mpres_mul (T7p ,T7p ,t1 ,n); + mpres_mul (T11p ,T11p ,t1 ,n); + // we finished T7p et T11p + + mpres_mul (t1, th->t10p ,th->t10p ,n); + mpres_ui_sub (t2 ,1 ,t1 ,n); + if (!mpres_invert (t1 , t2, n)) // t1=1 / (t8p^2-t10p^2) + { + mpres_gcd (f, t2, n); + + mpres_clear (t1, n); + mpres_clear (t2, n); + mpres_clear (t3, n); + mpres_clear (t4, n); + mpres_clear (t5, n); + + mpres_clear (T7p,n); + mpres_clear (T9p,n); + mpres_clear (T11p,n); + mpres_clear (T12p,n); + + return MORPHISM_FAIL; + } + mpres_mul (T9p ,T9p ,t1 ,n); + mpres_mul (T12p ,T12p ,t1 ,n); + // we finished T9p et T12p + + + + + + + // u0 := nu*be*la*T14p / T16p; + // u1 := nu*be*(la-1)*t5p*T13p / T16p - u0-1 ; + mpres_mul (divU->u0 ,T16p ,cHEll->nu ,n); // we already have inversed T16p + mpres_mul (divU->u0 ,divU->u0 ,th->be ,n); // nu*be/T16p + mpres_mul (divU->u1 ,divU->u0 ,th->t5p ,n); // u1 = 
nu*be*t5p/T16p + mpres_mul (divU->u0 ,divU->u0 ,cHEll->la ,n); + mpres_mul (divU->u0 ,divU->u0 ,T14p ,n); // u0 = la*nu*be*T14p / T16p + mpres_mul (divU->u1 ,divU->u1 ,T13p ,n); // nu*be*t5p*T13p/T16p + mpres_sub_ui (t1 ,cHEll->la ,1 ,n); + mpres_mul (divU->u1 ,divU->u1 ,t1 ,n); // nu*be*(la-1)*t5p*T13p/T16p + mpres_sub (divU->u1 ,divU->u1 ,divU->u0 ,n); + mpres_sub_ui (divU->u1 ,divU->u1 ,1 ,n); // u1=(la-1)*nu*be*t5p*T14p/T16p-u0-1 + + + + + + + // V0:= (p*ga^2*nu^3*T14p/T16p^3) * (t10p^2 + nu^2*be^2 -2) * ( A*D*(be*ga*(T7p*T12p*Rac-2*(XZ+YT))-t7p*t10p*T9p*T11p) + (ga+be)*((1-be*ga)*((be+ga)*(X+T)*(Y+Z)-(X^2+Y^2+Z^2+T^2))+(2-be^2-ga^2)*(XT+YZ)) ); + + mpres_mul (t1 ,th->t10p ,th->t10p ,n); + mpres_mul (divV->V0 ,cHEll->nu, th->be ,n); + mpres_mul (divV->V0 ,divV->V0 ,divV->V0 ,n); + mpres_add (divV->V0 ,t1 ,divV->V0 ,n); + mpres_sub_ui (divV->V0 ,divV->V0 ,2 ,n); // ( t10p^2 + nu^2*be^2 -2 ) + + mpres_mul (t1 ,cHEll->nu, T16p ,n); // On avait deja inverse T16p + mpres_mul (t2 ,t1 ,th->ga ,n); + mpres_mul (t2 ,t2 ,t2 ,n); + mpres_mul (t2 ,t2 ,t1 ,n); // ga^2 * nu^3 / T16p^3 + mpres_mul (divV->V0 ,divV->V0 ,t2 ,n); + mpres_mul (divV->V0 ,divV->V0 ,T14p ,n); + // V0 = (t10p^2+nu^2*be^2-2)*ga^2*nu^3*T14p / T16p^3 + mpres_mul (divV->V0 ,divV->V0 ,th->p ,n); + // V0 = p*(t10p^2+nu^2*be^2-2)*ga^2*nu^3*T14p / T16p^3 + + + mpres_mul (t1, P->X ,P->Z ,n); + mpres_mul (t2, P->Y ,P->T ,n); + mpres_add (t2 ,t2 ,t1 ,n); // XZ+YT + mpres_mul_ui (t2 ,t2 ,2 ,n); + mpres_mul (t1, T7p ,T12p ,n); + mpres_mul (t1, t1 ,th->Rac ,n); // Rac*T7p*T12p + mpres_sub (t1 ,t1 ,t2 ,n); // Rac*T7p*T12p -2*(XZ+YT) + mpres_mul (t1 ,t1 ,th->be ,n); + mpres_mul (t1 ,t1 ,th->ga ,n); // be*ga* ( T7p*T12p*Rac - 2*(XZ+YT) ) + + mpres_mul (t2 ,T9p ,T11p ,n); + mpres_mul (t2 ,t2 ,th->t7p ,n); + mpres_mul (t2 ,t2 ,th->t10p ,n); // al*de*t7p^2*T9p*T11p*R=t7p*t10p*T9p*T11p + mpres_sub (t1 ,t1 ,t2 ,n); + // t1 = be*ga* ( T7p*T12p*Rac - 2*(XZ+YT) ) - t7p*t10p*T9p*T11p + mpres_add (t4 ,th->be ,th->ga ,n); // 
t4=be+ga + mpres_add_ui (t2 ,t4 ,2 ,n); // t2 = A = 2+be+ga + mpres_ui_sub (t3 ,2 ,t4 ,n); // t3 = D = 2-be-ga + mpres_mul (t2 ,t3 ,t2 ,n); + mpres_mul (t1 ,t1 ,t2 ,n); + // t1 = A*D*( be*ga* ( T7p*T12p*Rac - 2*(XZ+YT) ) - t7p*t10p*T9p*T11p ) + + mpres_add (t2 ,P->Z ,P->Y ,n); + mpres_add (t3 ,P->X, P->T ,n); + mpres_mul (t2 ,t2 ,t3 ,n); // (X+T)*(Y+Z) + mpres_mul (t2 ,t2 ,t4 ,n); // t2 = (be+ga)*(X+T)*(Y+Z) + mpres_mul (t3 ,P->X ,P->X ,n); + mpres_mul (t4 ,P->Y ,P->Y ,n); + mpres_add (t3 ,t3 ,t4 ,n); + mpres_mul (t4 ,P->Z ,P->Z ,n); + mpres_add (t3 ,t3 ,t4 ,n); + mpres_mul (t4 ,P->T ,P->T ,n); + mpres_add (t3 ,t3 ,t4 ,n); // t3 = X^2+Y^2+Z^2+T^2 + mpres_sub (t2 ,t2 ,t3 ,n); + mpres_mul (t3 ,th->be ,th->ga ,n); + mpres_ui_sub (t3 ,1 ,t3 ,n); + mpres_mul (t2 ,t2 ,t3 ,n); + // t2 = (1-be*ga) * ( (be+ga)*(X+T)*(Y+Z) - (X^2+Y^2+Z^2+T^2)) + + mpres_mul (t3 ,th->be, th->be, n); + mpres_mul (t4 ,th->ga, th->ga ,n); + mpres_add (t3 ,t3 ,t4 ,n); + mpres_ui_sub (t3 ,2 ,t3 ,n); // 2-be^2-ga^2 + mpres_mul (t4 ,P->X ,P->T ,n); + mpres_mul (t5 ,P->Y ,P->Z ,n); + mpres_add (t4 ,t4 ,t5 ,n); + mpres_mul (t3 ,t3 ,t4 ,n); // (2-be^2-ga^2)*(XT+YZ) + mpres_add (t2 ,t2 ,t3 ,n); + //t2=(1-be*ga)*((be+ga)*(X+T)*(Y+Z)-(X^2+Y^2+Z^2+T^2)) + (2-be^2-ga^2)*(XT+YZ) + mpres_add (t4 ,th->be ,th->ga ,n); // t4=be+ga + mpres_mul (t2 ,t2 ,t4 ,n); + //t2=(be+ga)*((1-be*ga)*((be+ga)*(X+T)*(Y+Z)-(X^2+Y^2+Z^2+T^2))+(2-be^2-ga^2)*(XT+YZ)) + + mpres_add (t1 ,t1 ,t2 ,n); + mpres_mul (divV->V0 ,divV->V0 ,t1 ,n); + // We finished V0 + + + + + + + if ( mpres_is_zero (divU->u0,n) ) { // u0=0 + // u0=0 => V0=0, v1v0=0, V1=f(-u1)/u1^2 + + mpres_set_ui (divV->v1v0 ,0 ,n ); + + mpres_add_ui (divV->V1 ,divU->u1 ,1 ,n); + mpres_mul (divV->V1 ,divV->V1 ,divU->u1 ,n); + mpres_add (t1 ,divU->u1 ,cHEll->la ,n); + mpres_mul (divV->V1 ,divV->V1 ,t1 ,n); + mpres_add (t1 ,divU->u1 ,cHEll->mu ,n); + mpres_mul (divV->V1 ,divV->V1 ,t1 ,n); + mpres_add (t1 ,divU->u1 ,cHEll->nu ,n); + mpres_mul (divV->V1 ,divV->V1 ,t1 ,n); + 
mpres_neg (divV->V1 ,divV->V1 ,n); // f(-u1) + + if (!mpres_invert (t1 , divU->u1, n)) // t1=1 / u1 + { + mpres_gcd (f, divU->u0, n); + + mpres_clear (t1, n); + mpres_clear (t2, n); + mpres_clear (t3, n); + mpres_clear (t4, n); + mpres_clear (t5, n); + mpres_clear (T7p,n); + mpres_clear (T9p,n); + mpres_clear (T11p,n); + mpres_clear (T12p,n); + + return MORPHISM_FAIL; + } + mpres_mul (divV->V1 ,divV->V1 ,t1 ,n); + mpres_mul (divV->V1 ,divV->V1 ,t1 ,n); // V1=f(-u1)/u1^2 + + + } + else { // u0 != 0 + + + if ( mpres_is_zero (divV->V0,n) ) { // V0=0 + // u0 != 0, V0 = 0 => v1v0=0 + /* + V1:=-1/u0*( u0^2*Coefficient(f,4)-u0^2*u1-Coefficient(f,2)*u0+Coefficient(f,1)*u1)); + */ + + if (!mpres_invert (t1 , divU->u0, n)) // t1=1 / u0 + { + mpres_gcd (f, divU->u0, n); + + mpres_clear (t1, n); + mpres_clear (t2, n); + mpres_clear (t3, n); + mpres_clear (t4, n); + mpres_clear (t5, n); + + mpres_clear (T7p,n); + mpres_clear (T9p,n); + mpres_clear (T11p,n); + mpres_clear (T12p,n); + + return MORPHISM_FAIL; + + } + + + mpres_set_ui (divV->v1v0 ,0 ,n); + + + mpres_mul (t2 ,cHEll->la ,cHEll->mu ,n); + mpres_mul (t3 ,t2 ,cHEll->nu ,n); // t3 = la*mu*nu = Coefficient(f,1) + mpres_add (t2 ,t2 ,t3 ,n); + mpres_add (t5 ,cHEll->la ,cHEll->mu ,n); + mpres_mul (t4 ,cHEll->nu ,t5 ,n); + mpres_add (t2 ,t2 ,t4 ,n); + mpres_neg (t2 ,t2, n); + // t2= - (la*mu*nu+la*mu+la*nu+mu*nu) = Coefficient(f,2) + mpres_add(t4 ,t5 ,cHEll->nu ,n); + mpres_add_ui (t4 ,t4 ,1 ,n); + mpres_neg (t4 ,t4 ,n); // t4 = -(la+mu+nu+1) = Coefficient(f,4) + + mpres_mul (divV->V1 ,divU->u0 ,t4 ,n); + mpres_mul (t5, divU->u0 ,divU->u1 ,n); + mpres_sub (divV->V1 ,t5, divV->V1 ,n); + mpres_add (divV->V1 ,divV->V1 ,t2 ,n); + // -u0*Coefficient(f,4)+u0*u1+Coefficient(f,2) + mpres_mul (t5 ,t3 ,divU->u1 ,n); + mpres_mul (t3 ,t5 ,t1 ,n); // Coefficient(f,1)*u1/u0 + mpres_sub (divV->V1 ,divV->V1 ,t3 ,n); + + } + else { // u0 !=0 , V0 !=0 + + /* + V1:=( u0^3+V0*u1 -u1^2*u0^2 -Coefficient(f,3)*u0^2 +Coefficient(f,1)*u0 + 
Coefficient(f,4)*u1*u0^2 )^2/( 4*V0*u0^2 ); + v1v0:=( u0^3+V0*u1 -u1^2*u0^2 -Coefficient(f,3)*u0^2 +Coefficient(f,1)*u0 + Coefficient(f,4)*u1*u0^2 )/( 2*u0 ); + */ + if (!mpres_invert (t1 , divU->u0, n)) // t1=1 / u0 + { + mpres_gcd (f, t1, n); + + mpres_clear (t1, n); + mpres_clear (t2, n); + mpres_clear (t3, n); + mpres_clear (t4, n); + mpres_clear (t5, n); + + mpres_clear (T7p,n); + mpres_clear (T9p,n); + mpres_clear (T11p,n); + mpres_clear (T12p,n); + + return MORPHISM_FAIL; + } + + + mpres_mul (t3 ,cHEll->la ,cHEll->mu ,n); + mpres_mul (t2 ,t3 ,cHEll->nu ,n); // t2 = la*mu*nu = Coefficient(f,1) + mpres_add (t5 ,cHEll->la ,cHEll->mu ,n); + mpres_mul (t4 ,cHEll->nu ,t5 ,n); + mpres_add (t3 ,t3 ,t4 ,n); + mpres_add (t4 ,t5 ,cHEll->nu ,n); + mpres_add (t3 ,t3 ,t4 ,n); + // t3 =la*mu+la*nu+mu*nu + la+mu+nu = Coefficient(f,3) + mpres_add_ui (t4 ,t4 ,1 ,n); + mpres_neg (t4 ,t4 ,n); // t4 = -(la+mu+nu+1) = Coefficient(f,4) + + + mpres_sub (divV->v1v0 ,t4 ,divU->u1 ,n); + mpres_mul (divV->v1v0 ,divV->v1v0 ,divU->u1 ,n);//u1*(-u1+Coef(f,4)) + mpres_sub (divV->v1v0 ,divV->v1v0 ,t3 ,n); + mpres_add (divV->v1v0 ,divV->v1v0 ,divU->u0 ,n); + // u0 - u1^2 - Coefficient(f,3) + u1*Coefficient(f,4) + mpres_mul (divV->v1v0 ,divV->v1v0 ,divU->u0 ,n); + mpres_add (divV->v1v0 ,divV->v1v0 ,t2 ,n); + //u0^2-u0*u1^2-u0*Coefficient(f,3)+u0*u1*Coefficient(f,4)+Coefficient(f,1) + + mpres_mul (t2 ,divV->V0 ,t1 ,n); + mpres_mul (t1 ,divU->u1 ,t2 ,n); + mpres_add (divV->v1v0 ,divV->v1v0 ,t1 ,n); + + mpres_set_ui (t1 ,2 ,n); + mpres_invert (t2 ,t1, n); // t2 = 1/2 + mpres_mul (divV->v1v0 ,divV->v1v0 ,t2 ,n); + + + if (!mpres_invert (t1 , divV->V0, n)) // t1=1 / V0 + { + mpres_gcd (f, divV->V0, n); + + mpres_clear (t1, n); + mpres_clear (t2, n); + mpres_clear (t3, n); + mpres_clear (t4, n); + mpres_clear (t5, n); + + mpres_clear (T7p,n); + mpres_clear (T9p,n); + mpres_clear (T11p,n); + mpres_clear (T12p,n); + + return MORPHISM_FAIL; + } + mpres_mul (divV->V1 ,divV->v1v0 ,divV->v1v0 ,n); + 
mpres_mul (divV->V1 ,divV->V1 ,t1 ,n); + + + } + } + + + mpres_clear (t1, n); + mpres_clear (t2, n); + mpres_clear (t3, n); + mpres_clear (t4, n); + mpres_clear (t5, n); + + mpres_clear (T7p, n); + mpres_clear (T9p, n); + mpres_clear (T11p, n); + mpres_clear (T12p, n); + + + return MORPHISM; + +} + + + + + + + + + + + +// ********************************************************************** + + + + + + + + + + + + +/* + Input: The Mumford polynomial u of degree 1 + The square of the Mumford polynomial v // V=v^2 + Output: The points on the two underlying elliptic curves + These curves are in short Weierstrass form and we get A + + On the hyperelliptic curve we have the point (x::1) with x=-u0/u1 +*/ +int jac_to_EllW (mpz_t f, mpmod_t n, + curve *T1, curve *T2, + DivMumfordU divU, DivMumfordV divV, + curveHyperEll cHEll) { + int test; + + if (divU->degree == 0) { // u=0 + return MORPHISM_FOUND_ZERO_CURVE_1_AND_2; + } + else if (divU->degree == 1) { // degre de u =1 + test = jac_to_EllW_Degree1 (f,n,T1,T2,divU,cHEll); + } + else { // degre de u =2 + test = jac_to_EllW_Degree2 (f,n,T1,T2,divU,divV,cHEll); + } + + return test; +} + + + + + +/* + Input: The Mumford polynomial u of degree 1 + The square of the Mumford polynomial v // V=v^2 + Output: The points on the two underlying elliptic curves + These curves are in short Weierstrass form and we get A + + On the hyperelliptic curve we have the point (x::1) with x=-u0/u1 +*/ +int jac_to_EllW_Degree1 (mpz_t f, mpmod_t n, + curve *T1, curve *T2, + DivMumfordU divU, + curveHyperEll cHEll) { + int test; + mpres_t A2,a6; + mpres_t x,z; + + mpres_init (A2 ,n); + mpres_init (a6 ,n); + mpres_init (x ,n); + mpres_init (z ,n); + + mpres_neg (x ,divU->u0 ,n); + mpz_set (z ,divU->u1); + + // for the first curve + test = coeff_EllW (f,n,T1,A2,a6,cHEll ); + if (test == MORPHISM_FAIL ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_FAIL; + } + test = HEll_EllW_degree1 
(f,n ,x,z, T1, A2,a6 ,cHEll); + if (test == MORPHISM_FAIL ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return test; + } + if (test == MORPHISM_FOUND_ZERO_CURVE ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_FOUND_ZERO_CURVE_1; + } + + + + + + // for the second curve + mpres_neg (cHEll->q ,cHEll->q ,n); + test = coeff_EllW (f,n,T2,A2,a6,cHEll ); + if (test == MORPHISM_FAIL ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_FAIL; + } + test = HEll_EllW_degree1 (f,n ,x,z, T1, A2,a6 ,cHEll); + if (test != MORPHISM ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return test; + } + if (test == MORPHISM_FOUND_ZERO_CURVE ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_FOUND_ZERO_CURVE_1; + } + + + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM; +} + + + + + + + +/* + special case of jac_to_EllW_degree2: when one of y1 ,y2 is zero + coeff[0]=0 <=> al^2-be^2*delta <=> y1 or y2 is zero + In this case x1=0,1,la,mu or nu + First we find which one it is. Then it is possible to send the other point on + the elliptic curves. + Theoricaly we should obtain phi((x1,0))+phi((x2,y2)) on the curve with + phi((x1,0)) zero or of 2-torsion. 
+ We compute 2*phi((x2,y2)) and obtain a multiple of the point + */ +int degree2_case_y1_equal_zero (mpz_t f,mpmod_t n,curve *T1, curve *T2,DivMumfordU divU, DivMumfordV divV, curveHyperEll cHEll) { + + int test; + mpres_t t1,t2; + mpres_t g; + mpres_t x,z; + mpres_t A2,a6; + + mpres_init (t1,n); + mpres_init (t2,n); + mpres_init (g,n); + mpres_init (x,n); + + mpres_gcd (f ,divU->u0 ,n); + mpres_set_z (g ,f ,n); + if ( mpres_is_zero (g,n) ) { // x1 = 0 + mpres_neg (x ,divU->u1 ,n); // x2=-u1-x1 + } + else if ( mpz_cmp_ui (f,1) != 0 ) { // f !=0,1,n so it has a factor of n + mpres_clear (x ,n); + mpres_clear (g ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + return MORPHISM_FAIL; + } + else { + mpres_add_ui (t1 ,divU->u1 ,1 ,n); + mpres_add (t1 ,t1 ,divU->u0 ,n); // 1^2 + u1*1 + u0 + + mpres_gcd (f ,t1 ,n); + mpres_set_z (g ,f ,n); + if ( mpres_is_zero (g,n) ) { // x1 = 1 + mpres_add_ui (x ,divU->u1 ,1 ,n); + mpres_neg (x ,x ,n); // x2=-u1-x1 + } + else if ( mpz_cmp_ui (f,1) != 0 ) { // f !=0,1,n so it has a factor of n + mpres_clear (g ,n); + mpres_clear (x ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + return MORPHISM_FAIL; + } + else { + mpres_mul (t1 ,divU->u1 ,cHEll->la ,n); + mpres_add (t1 ,t1 ,divU->u0 ,n); + mpres_mul (t2 ,cHEll->la ,cHEll->la ,n); + mpres_add (t1 ,t1 ,t2 ,n); // la^2 + u1*la + u0 + + mpres_gcd (f ,t1 ,n); + mpres_set_z (g ,f ,n); + if ( mpres_is_zero (g,n) ) { // x1 = la + mpres_add (x ,divU->u1 ,cHEll->la ,n); + mpres_neg (x ,x ,n); // x2=-u1-x1 + } + else if ( mpz_cmp_ui (f,1) != 0 ) { + // f !=0,1,n so it has a factor of n + mpres_clear (g ,n); + mpres_clear (x ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + return MORPHISM_FAIL; + } + else { + mpres_mul (t1 ,divU->u1 ,cHEll->mu ,n); + mpres_add (t1 ,t1 ,divU->u0 ,n); + mpres_mul (t2 ,cHEll->mu ,cHEll->mu ,n); + mpres_add (t1 ,t1 ,t2 ,n); // mu^2 + u1*mu + u0 + + mpres_gcd (f ,t1 ,n); + mpres_set_z (g ,f ,n); + if ( mpres_is_zero (g,n) ) { // x1 = mu + mpres_add (x ,divU->u1 
,cHEll->mu ,n); + mpres_neg (x ,x ,n); // x2=-u1-x1 + } + else if ( mpz_cmp_ui (f,1) != 0 ) { + // f !=0,1,n so it has a factor of n + mpres_clear (g ,n); + mpres_clear (x ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + return MORPHISM_FAIL; + } + else { + mpres_mul (t1 ,divU->u1 ,cHEll->nu ,n); + mpres_add (t1 ,t1 ,divU->u0 ,n); + mpres_mul (t2 ,cHEll->nu ,cHEll->nu ,n); + mpres_add (t1 ,t1 ,t2 ,n); // nu^2 + u1*nu + u0 + + mpres_gcd (f ,t1 ,n); + mpres_set_z (g ,f ,n); + if ( mpres_is_zero (g,n) ) { // x1 = nu + mpres_add (x ,divU->u1 ,cHEll->nu ,n); + mpres_neg (x ,x ,n); // x2=-u1-x1 + } + else if ( mpz_cmp_ui (f,1) != 0 ) { + // f !=0,1,n so it has a factor of n + mpres_clear (g ,n); + mpres_clear (x ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + return MORPHISM_FAIL; + } + else { + mpres_clear (g ,n); + mpres_clear (x ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + return MORPHISM_ERROR; + } + } + } + } + } + mpres_clear (g ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + + mpres_init( A2 ,n); + mpres_init( a6 ,n); + mpres_init( z ,n); + + // we have set x2 so we send it to the elliptic curve + mpres_set_ui (z ,1 ,n); + + // for the first curve + test = coeff_EllW (f,n,T1,A2,a6,cHEll ); + if (test == MORPHISM_FAIL ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_FAIL; + } + + test = HEll_EllW_degree1 (f,n ,x,z, T1, A2,a6 ,cHEll); + if (test == MORPHISM_FAIL ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_FAIL; + } + if (test == MORPHISM_FOUND_ZERO_CURVE ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_FOUND_ZERO_CURVE_1; + } + if (test == MORPHISM_STEP1 ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_STEP1; + } + + + // for the second curve + mpres_neg (cHEll->q ,cHEll->q ,n); + test = coeff_EllW 
(f,n,T2,A2,a6,cHEll ); + if (test == MORPHISM_FAIL ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_FAIL; + } + + test = HEll_EllW_degree1 (f,n ,x,z, T1, A2,a6 ,cHEll); + if (test != MORPHISM ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return test; + } + if (test == MORPHISM_FOUND_ZERO_CURVE ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_FOUND_ZERO_CURVE_1; + } + if (test == MORPHISM_STEP1 ){ + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + return MORPHISM_STEP1; + } + + mpres_clear (A2 ,n); + mpres_clear (a6 ,n); + mpres_clear (x ,n); + mpres_clear (z ,n); + + + // double the point on the two curve + if ( double_short_weierstrass (f,n,T1 )==0 ) { + return MORPHISM_STEP1; + } + if ( double_short_weierstrass (f,n,T2 )==0 ) { + return MORPHISM_STEP1; + } + + + + + return MORPHISM; + } + + + + + + + + + + + + + + + + + + +/* + Input: The Mumford polynomial u of degree 2 + The square of the Mumford polynomial v // V=v^2 + Output: The points on the two underlying elliptic curves + These curves are in short Weierstrass form and we get A + + +*/ +int jac_to_EllW_Degree2 (mpz_t f, mpmod_t n, + curve *T1, curve *T2, + DivMumfordU divU, DivMumfordV divV, + curveHyperEll cHEll) { + int test; + mpalgpol_t pol; + mpalgres_t x1,x2,y1,y2,z1,z2; + mpalgres_t X1,X2,Y1,Y2,Z1,Z2; + mpres_t al,be; + mpres_t t1,t2; + mpres_t A2,a6; + mpalgres_t a4twist,R,B,invB; + + mpres_init (al,n); + mpres_init (be,n); + mpres_init (t1,n); + mpres_init (t2,n); + + mpalgpol_init (pol,n); + mpalgres_init (x1,n); + mpalgres_init (x2,n); + mpalgres_init (y1,n); + mpalgres_init (y2,n); + mpalgres_init (z1,n); + mpalgres_init (z2,n); + mpalgres_init (B,n); + mpalgres_init (invB,n); + + + + mpres_set_ui (pol->coeff[1] ,0,n); + mpres_mul_ui (divV->v1v0 ,divV->v1v0 ,4 ,n); //since we only use 
4*v1v0 + + // Construction of delta such that x1=-u1+sqrt(delta) + mpres_mul (be ,divU->u1 ,divU->u1 ,n); // be=u1^2 + mpres_mul_ui (t2 ,divU->u0 ,4 ,n); + mpres_sub (t2 ,be ,t2 ,n); // t2 = delta = u1^2-4*u0 + mpres_neg (pol->coeff[0] ,t2 ,n); // -delta + + + + mpres_neg (x1[0] ,divU->u1 ,n); + mpres_neg (x2[0] ,divU->u1 ,n); + mpres_set_ui (x1[1] ,1 ,n); // x1 = -u1+sqrt(delta) + mpres_set_si (x2[1] ,-1 ,n); // x2 = -u1-sqrt(delta) + + mpalgres_set_ui (z1 ,2 ,n); // z1 = 2 + mpalgres_set_ui (z2 ,2 ,n); // z2 = 2 + + + + mpalgres_set_ui (y1 ,1 ,n); // y1' = 1 + + + + // we have B = y1^2 = al + be* sqrt(delta) + // al = 4V0 -4*v1v0*u1 + V1*(delta+u1^2) + // be = 4*v1v0 - 2u1*V1 + mpres_add (al ,t2 ,be ,n); + mpres_mul (al ,al ,divV->V1 ,n); // V1*(delta+u1^2) + mpres_mul (t1 ,divV->v1v0 ,divU->u1 ,n); // 4*v1v0*u1 + mpres_mul_ui (be ,divV->V0 ,4 ,n); // 4V0 + mpres_sub (t1 ,be ,t1 ,n);// t1 = 4V0 -4*v1v0*u1 + mpres_add (al ,al ,t1 ,n ); // al = 4V0 -4*v1v0*u1 + V1*(delta+u1^2) + + mpres_mul (be ,divV->V1 ,divU->u1 ,n); + mpres_mul_ui (be ,be ,2 ,n); + mpres_sub (be ,divV->v1v0 ,be ,n); // be = 4*v1v0 - 2u1*V1 + + mpz_set (B[0] ,al); + mpz_set (B[1] ,be); + + + if (mpalgres_is_zero (B ,pol,n) ) { // y1=0 + mpres_clear (al,n); + mpres_clear (be,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + mpalgpol_clear (pol,n); + mpalgres_clear (x1,n); + mpalgres_clear (x2,n); + mpalgres_clear (y1,n); + mpalgres_clear (y2,n); + mpalgres_clear (z1,n); + mpalgres_clear (z2,n); + + mpalgres_clear (B,n); + mpalgres_clear (invB,n); + + test = degree2_case_y1_equal_zero (f,n, T1,T2, divU,divV,cHEll); + return test; + } + + test = mpalgres_invert (invB ,B ,pol,n,f); // 1/B=1/y1^2 + if ( test == -1 ) { + // This should not happen + mpres_clear (al,n); + mpres_clear (be,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + mpalgpol_clear (pol,n); + mpalgres_clear (x1,n); + mpalgres_clear (x2,n); + mpalgres_clear (y1,n); + mpalgres_clear (y2,n); + mpalgres_clear (z1,n); + mpalgres_clear 
(z2,n); + + mpalgres_clear (B,n); + mpalgres_clear (invB,n); + return MORPHISM_ERROR; + } + if ( test == 0 ) { + mpres_clear (al,n); + mpres_clear (be,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + mpalgpol_clear (pol,n); + mpalgres_clear (x1,n); + mpalgres_clear (x2,n); + mpalgres_clear (y1,n); + mpalgres_clear (y2,n); + mpalgres_clear (z1,n); + mpalgres_clear (z2,n); + + mpalgres_clear (B,n); + mpalgres_clear (invB,n); + return MORPHISM_FAIL; + } + // ok we have inverted B + + + + // y2' = y2/y1 = y2y1/y1^2 = y2y1*invB + // y1*y2 = 4*u0*V1 - 4*v0v1*u1 + 4V0 + + + mpres_mul (t2 ,divU->u0 ,divV->V1 ,n); + mpres_mul_ui(t2 ,t2 ,4 ,n); + mpres_add (t1 ,t1 ,t2 ,n); // 4*u0*V1 - 4*v0v1*u1 + 4V0 + mpalgres_mul_mpres (y2 ,invB ,t1 ,pol ,n); + + + + mpres_clear (al,n); + mpres_clear (be,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + + + + + // we have construct the points on the hyperelliptic curve in the good algebra + // now go to the elliptic curves + + + mpalgres_init (R,n); + mpalgres_init (a4twist,n); + mpres_init (A2,n); + mpres_init (a6,n); + + mpalgres_init (X1,n); + mpalgres_init (X2,n); + mpalgres_init (Y1,n); + mpalgres_init (Y2,n); + mpalgres_init (Z1,n); + mpalgres_init (Z2,n); + + + + // the first one: + test= coeff_EllW_EllWtwist (f,n ,pol ,T1,a6,A2 ,R,a4twist ,B,invB,cHEll); + if (test == MORPHISM_FAIL) { + mpalgres_clear (R,n); + mpalgres_clear (a4twist,n); + mpres_clear (a6,n); + mpres_clear (A2,n); + + mpalgpol_clear (pol,n); + mpalgres_clear (x1,n); + mpalgres_clear (x2,n); + mpalgres_clear (y1,n); + mpalgres_clear (y2,n); + mpalgres_clear (z1,n); + mpalgres_clear (z2,n); + + mpalgres_clear (X1,n); + mpalgres_clear (X2,n); + mpalgres_clear (Y1,n); + mpalgres_clear (Y2,n); + mpalgres_clear (Z1,n); + mpalgres_clear (Z2,n); + + mpalgres_clear (B,n); + mpalgres_clear (invB,n); + return MORPHISM_FAIL; + } + + HEll_to_EllW(n,pol, X1,Y1,Z1, x1,y1,z1, cHEll ); + HEll_to_EllW(n,pol, X2,Y2,Z2, x2,y2,z2, cHEll ); + EllW_to_EllWshort(n,pol, R,A2, X1,Y1,Z1, 
X1,Y1,Z1); + EllW_to_EllWshort(n,pol, R,A2, X2,Y2,Z2, X2,Y2,Z2); + test= addW_and_scale (f,n,pol,T1,a6,X1,Y1,Z1,X2,Y2,Z2,a4twist,R); + + if ( test != MORPHISM ) { + mpalgres_clear (R,n); + mpalgres_clear (a4twist,n); + mpres_clear (a6,n); + mpres_clear (A2,n); + + mpalgpol_clear (pol,n); + mpalgres_clear (x1,n); + mpalgres_clear (x2,n); + mpalgres_clear (y1,n); + mpalgres_clear (y2,n); + mpalgres_clear (z1,n); + mpalgres_clear (z2,n); + + mpalgres_clear (X1,n); + mpalgres_clear (X2,n); + mpalgres_clear (Y1,n); + mpalgres_clear (Y2,n); + mpalgres_clear (Z1,n); + mpalgres_clear (Z2,n); + + mpalgres_clear (B,n); + mpalgres_clear (invB,n); + + if (test == MORPHISM_FOUND_ZERO_CURVE) { + return MORPHISM_FOUND_ZERO_CURVE_1; + } + else { + return test; + } + } + + + + + + // the second one: + mpres_neg (cHEll->q ,cHEll->q ,n); + test= coeff_EllW_EllWtwist (f,n ,pol ,T2,a6,A2 ,R,a4twist ,B,invB,cHEll); + if (test == MORPHISM_FAIL) { + mpalgres_clear (R,n); + mpalgres_clear (a4twist,n); + mpres_clear (a6,n); + mpres_clear (A2,n); + + mpalgpol_clear (pol,n); + mpalgres_clear (x1,n); + mpalgres_clear (x2,n); + mpalgres_clear (y1,n); + mpalgres_clear (y2,n); + mpalgres_clear (z1,n); + mpalgres_clear (z2,n); + + mpalgres_clear (X1,n); + mpalgres_clear (X2,n); + mpalgres_clear (Y1,n); + mpalgres_clear (Y2,n); + mpalgres_clear (Z1,n); + mpalgres_clear (Z2,n); + + mpalgres_clear (B,n); + mpalgres_clear (invB,n); + return MORPHISM_FAIL; + } + + HEll_to_EllW(n,pol, X1,Y1,Z1, x1,y1,z1, cHEll ); + HEll_to_EllW(n,pol, X2,Y2,Z2, x2,y2,z2, cHEll ); + EllW_to_EllWshort(n,pol, R,A2, X1,Y1,Z1, X1,Y1,Z1); + EllW_to_EllWshort(n,pol, R,A2, X2,Y2,Z2, X2,Y2,Z2); + test= addW_and_scale (f,n,pol,T2,a6,X1,Y1,Z1,X2,Y2,Z2,a4twist,R); + + if ( test != MORPHISM ) { + mpalgres_clear (R,n); + mpalgres_clear (a4twist,n); + mpres_clear (a6,n); + mpres_clear (A2,n); + + mpalgpol_clear (pol,n); + mpalgres_clear (x1,n); + mpalgres_clear (x2,n); + mpalgres_clear (y1,n); + mpalgres_clear (y2,n); + mpalgres_clear 
(z1,n); + mpalgres_clear (z2,n); + + mpalgres_clear (X1,n); + mpalgres_clear (X2,n); + mpalgres_clear (Y1,n); + mpalgres_clear (Y2,n); + mpalgres_clear (Z1,n); + mpalgres_clear (Z2,n); + + mpalgres_clear (B,n); + mpalgres_clear (invB,n); + + if (test == MORPHISM_FOUND_ZERO_CURVE) { + return MORPHISM_FOUND_ZERO_CURVE_1; + } + else { + return test; + } + } + + + + mpalgres_clear (R,n); + mpalgres_clear (a4twist,n); + mpres_clear (a6,n); + mpres_clear (A2,n); + + mpalgpol_clear (pol,n); + mpalgres_clear (x1,n); + mpalgres_clear (x2,n); + mpalgres_clear (y1,n); + mpalgres_clear (y2,n); + mpalgres_clear (z1,n); + mpalgres_clear (z2,n); + + mpalgres_clear (X1,n); + mpalgres_clear (X2,n); + mpalgres_clear (Y1,n); + mpalgres_clear (Y2,n); + mpalgres_clear (Z1,n); + mpalgres_clear (Z2,n); + + mpalgres_clear (B,n); + mpalgres_clear (invB,n); + return MORPHISM; +} + + + + + + + + + +/* + send a point (x,y,z) on the hyperelliptic curve to the underlying curve + defined by cHEll->q (note that the other is defined by -q) + Work with coordinates in k[Y]/pol(Y) + return the result in (x,y,z) + + (x,y,z) --> ( wd*( x-(mu+q)z )^2*( x-(mu-q)z ) ,wn*y*z^2, wd*(x-(mu-q)z)^3 ) + wn = 8*q wd = (mu-q)*( -1+(mu-q) ) +*/ +void HEll_to_EllW(mpmod_t n,mpalgpol_t pol, + mpalgres_t aX2,mpalgres_t aY2,mpalgres_t aZ2, + mpalgres_t aX ,mpalgres_t aY ,mpalgres_t aZ , + curveHyperEll cHEll ) { + mpalgres_t aT1,aT2; + mpalgres_t x,y,z; + mpres_t temp1,temp2; + + mpres_init (temp1 ,n); + mpres_init (temp2 ,n); + mpalgres_init (aT1 ,n); + mpalgres_init (aT2 ,n); + mpalgres_init (x,n); + mpalgres_init (y,n); + mpalgres_init (z,n); + + mpalgres_set (x,aX,n); + mpalgres_set (y,aY,n); + mpalgres_set (z,aZ,n); + + + mpres_mul_ui (temp1 ,cHEll->q ,8 ,n); + mpalgres_mul (aT1 ,z ,z ,pol ,n); + mpalgres_mul (y ,y ,aT1 ,pol ,n); + mpalgres_mul_mpres (y ,y ,temp1 ,pol ,n); // 8*q * y * z^2 + + mpres_add (temp1 ,cHEll->mu ,cHEll->q ,n); + mpalgres_mul_mpres (aT2 ,z ,temp1 ,pol ,n); // (mu+q)z + mpres_sub (temp1 
,cHEll->mu ,cHEll->q ,n);//(mu-q) + mpalgres_mul_mpres (aT1 ,z ,temp1 ,pol ,n); // (mu-q)z + + mpalgres_sub (z ,x ,aT1 ,pol ,n); // x-(mu-q)z + mpalgres_sub (x ,x ,aT2 ,pol ,n); // x-(mu+q)z + + + + + mpalgres_mul (x ,x ,x ,pol ,n); + mpalgres_mul (x ,x ,z ,pol ,n); // ( x-(mu+q)z )^2*( x-(mu-q)z ) + + + mpalgres_mul (aT1 ,z ,z ,pol ,n); + mpalgres_mul (z ,z ,aT1 ,pol ,n); // ( x-(mu-q)z )^3 + + mpres_sub_ui (temp2 ,temp1 ,1 ,n); + mpres_mul (temp1 ,temp1 ,temp2 ,n); // wd = (mu-q)*( -1+(mu-q) ) + + mpalgres_mul_mpres (x ,x ,temp1 ,pol ,n); + mpalgres_mul_mpres (z ,z ,temp1 ,pol ,n); + + + + mpalgres_set (aX2,x,n); + mpalgres_set (aY2,y,n); + mpalgres_set (aZ2,z,n); + + + mpres_clear (temp1 ,n); + mpres_clear (temp2 ,n); + mpalgres_clear (aT1 ,n); + mpalgres_clear (aT2 ,n); + mpalgres_clear (x,n); + mpalgres_clear (y,n); + mpalgres_clear (z,n); +} + + + + + + + + +/* + Get many coefficient of the short weierstrass curve + + the long weierstrass curve is given by + B*rc* y^2*z = ( x - z ) * ( x - x2^2 *z ) * ( x - x3^2 *z ) + = x^3 + A2 * x^2 + A4 * x + A6 + rc = -q*mu*(mu-1) + x2 = (mu+q) / (mu-q) x3 = (1-(mu+q)) / (1-(mu-q)) + first get the curve + R * y^2*z = x^3 + a4' * x * z^2 + a6' * z^3 + by (x,y,z) -> (x+A2/3*z,y,z) + a4 = A4 - A2^2/3 a6 = A6 - A2*A4/3 + 2*A2^3/27 + R=B*rc + get the curve + y^2*z = x^3 + a4' * x * z^2 + a6' * z^3 + by (x,y,z) -> (x,y,R*z) + a4' = a4/R^2 a6' = a6/R^3 + + It get the coefficient a4 and a6 of the short weierstrass curve + a4 is put in T->A + A2 of the long weierstrass form + a4twist of the short weierstrass twisted curve + R +*/ +int coeff_EllW_EllWtwist(mpz_t f,mpmod_t n,mpalgpol_t pol, + curve *T,mpres_t a6,mpres_t A2, + mpalgres_t R, mpalgres_t a4twist, + mpalgres_t B,mpalgres_t invB,curveHyperEll cHEll ) { + + mpres_t t1,t2,t3,t4; + mpalgres_t aTemp; + + mpalgres_set_zero (a4twist ,n); + mpalgres_set_zero (R ,n); + + + mpres_init (t1,n); + mpres_init (t2,n); + mpres_init (t3,n); + mpres_init (t4,n); + + mpres_set_ui (R[1] ,0 
,n); + + mpres_sub (t1, cHEll->mu, cHEll->q ,n); // t1 = mu-q + mpres_ui_sub (t2 ,1 ,t1 ,n); // t2 = 1-(mu-q) + + mpres_mul (t3 ,t1 ,t2 ,n); + if ( !mpres_invert(t3,t3,n) ) { // t3 = 1 / ( (mu-q)*(1-(mu-q)) ) + mpres_gcd(f ,t3 ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + mpres_clear (t3,n); + mpres_clear (t4,n); + return MORPHISM_FAIL; + } + + + mpres_add (t4, cHEll->mu, cHEll->q ,n); // t4 = mu+q + + mpres_mul (t2 ,t3 ,t2 ,n); // 1 / (mu-q) + mpres_mul (t2 ,t2 ,t4 ,n); // t2 = (mu+q) / (mu-q) = x2 + + mpres_mul (t3 ,t3 ,t1 ,n); // 1 / (1-(mu-q)) + mpres_ui_sub (t4 ,1 ,t4 ,n); + mpres_mul (t3 ,t3 ,t4 ,n); // t3 = (1-(mu+q)) / (1-(mu-q)) = x3 + + + mpres_mul (t2 ,t2 ,t2 ,n); // t2 = x2^2 + mpres_mul (t3 ,t3 ,t3 ,n); // t3 = x3^2 + + + mpres_mul (a6 ,t2 ,t3 ,n); // x2^2*x3^2 + mpres_add (T->A ,t2 ,t3 ,n); // x2^2+x3^2 + + + + + mpres_add_ui (A2 ,T->A ,1 ,n); + mpres_neg (A2 ,A2 ,n); // A2 = - (1+x2^2+x3^2) + mpres_add (T->A ,T->A ,a6 ,n); // A4 = x2^2*x3^2 + x2^2 + x3^2 + mpres_neg (a6 ,a6 ,n); // a6 = A6 = - x2^2*x3^2 + + + + mpres_mul_ui (a6 ,a6 ,27 ,n); // 27*A6 + mpres_mul (t1 ,A2 ,T->A ,n); + mpres_mul_ui (t1 ,t1 ,9 ,n); // 9*A2*A4 + mpres_sub (a6 ,a6 ,t1 ,n); // 27*A6 - 9*A2*A4 + mpres_mul (t1 ,A2 ,A2 ,n); // A2^2 + mpres_mul (t3 ,t1 ,A2 ,n); + mpres_mul_ui (t3 ,t3 ,2 ,n); + mpres_add (a6 ,a6 ,t3 ,n); // 27*A6 - 9*A2*A4 + 2*A2^3 + + mpres_mul_ui (T->A ,T->A ,3 ,n); + mpres_sub (T->A ,T->A ,t1 ,n); // 3*A4 - A2^2 + + + + + + mpres_sub_ui (t1 ,cHEll->mu ,1 ,n); + mpres_mul (t1 ,t1 ,cHEll->mu ,n); + mpres_mul (t1 ,t1 ,cHEll->q ,n); + mpres_neg (t1 ,t1 ,n); // t1 = rc = -q*mu*(mu-1) + + + mpalgres_mul_mpres (R ,B ,t1 ,pol ,n); // R = rc*B + mpres_mul_ui (t2 ,t1 ,3 ,n); // 3*rc + + + if ( !mpres_invert(t2,t2,n) ) { // t2 = 1 / ( 3*rc ) + mpres_gcd(f ,t2 ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + mpres_clear (t3,n); + mpres_clear (t4,n); + return MORPHISM_FAIL; + } + + + + mpalgres_init (aTemp ,n); + + + + mpalgres_mul_mpres (aTemp ,invB ,t2 ,pol ,n); 
// 1 / ( 3*rc*B ) = 1/3R + mpalgres_mul (aTemp ,aTemp ,aTemp ,pol ,n); + mpalgres_mul_ui (aTemp ,aTemp ,3 ,pol ,n); // 1 / ( 3*R^2 ) + + + + mpalgres_mul_mpres (a4twist ,aTemp ,T->A ,pol ,n); + // a4' = (3*A4 - A2^2) / (3*R^2) + + mpres_mul (t1 ,t1 ,t2, n); // 1/3 + mpres_mul (T->A ,T->A ,t1 ,n); // a4 = (3*A4 - A2^2) / 3 + mpres_mul (t2 ,t1 ,t1, n); + mpres_mul (t1 ,t1 ,t2, n); // 1/27 + mpres_mul (a6 ,a6 ,t1 ,n); // a6 = ( 27*A6 - 9*A2*A4 + 2*A2^3 ) / 27 + + + + mpalgres_clear (aTemp ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + mpres_clear (t3,n); + mpres_clear (t4,n); + return MORPHISM; +} + + + + + +/* + Get many coefficient of the short weierstrass curve + + the long weierstrass curve is given by + rc* y^2*z = ( x - z ) * ( x - x2^2 *z ) * ( x - x3^2 *z ) + = x^3 + A2 * x^2 + A4 * x + A6 + rc = -q*mu*(mu-1) + x2 = (mu+q) / (mu-q) x3 = (1-(mu+q)) / (1-(mu-q)) + get the curve + rc * y^2*z = x^3 + a4 * x * z^2 + a6 * z^3 + by (x,y,z) -> (x+A2/3*z,y,z) + a4 = A4 - A2^2/3 a6 = A6 - A2*A4/3 + 2*A2^3/27 + + It get the coefficient a4 and a6 of the short weierstrass curve + (a4 is put in T->A) + A2 of the long weierstrass form +*/ +int coeff_EllW(mpz_t f,mpmod_t n, + curve *T,mpres_t A2, mpres_t a6, + curveHyperEll cHEll ) { + + mpres_t t1,t2,t3,t4; + + mpres_init (t1,n); + mpres_init (t2,n); + mpres_init (t3,n); + + mpres_sub (t1, cHEll->mu, cHEll->q ,n); // t1 = mu-q + mpres_ui_sub (t2 ,1 ,t1 ,n); // t2 = 1-(mu-q) + + mpres_mul (t3 ,t1 ,t2 ,n); + if ( !mpres_invert(t3,t3,n) ) { // t3 = 1 / ( (mu-q)*(1-(mu-q)) ) + mpres_gcd(f ,t3 ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + mpres_clear (t3,n); + return MORPHISM_FAIL; + } + + mpres_init (t4,n); + + mpres_add (t4, cHEll->mu, cHEll->q ,n); // t4 = mu+q + + + mpres_mul (t2 ,t3 ,t2 ,n); // 1 / (mu-q) + mpres_mul (t2 ,t2 ,t4 ,n); // t2 = (mu+q) / (mu-q) = x2 + + mpres_mul (t3 ,t3 ,t1 ,n); // 1 / (1-(mu-q)) + mpres_ui_sub (t4 ,1 ,t4 ,n); + mpres_mul (t3 ,t3 ,t4 ,n); // t3 = (1-(mu+q)) / (1-(mu-q)) = x3 + + + 
mpres_mul (t2 ,t2 ,t2 ,n); // t2 = x2^2 + mpres_mul (t3 ,t3 ,t3 ,n); // t3 = x3^2 + + + mpres_mul (a6 ,t2 ,t3 ,n); // x2^2*x3^2 + mpres_add (T->A ,t2 ,t3 ,n); // x2^2+x3^2 + + + mpres_clear (t4,n); + + + mpres_add_ui (A2 ,T->A ,1 ,n); + mpres_neg (A2 ,A2 ,n); // A2 = - (1+x2^2+x3^2) + mpres_add (T->A ,T->A ,a6 ,n); // A = A4 = x2^2*x3^2 + x2^2+x3^2 + mpres_neg (a6 ,a6 ,n); // a6 = A6 = - x2^2*x3^2 + + mpres_mul_ui (a6 ,a6 ,27 ,n); + mpres_mul (t1 ,A2 ,T->A ,n); + mpres_mul_ui (t1 ,t1 ,9 ,n); + mpres_sub (a6 ,a6 ,t1 ,n); // 27*A6 - 9*A2*A4 + mpres_mul (t1 ,A2 ,A2 ,n); // A2^2 + mpres_mul (t3 ,t1 ,t2 ,n); + mpres_mul_ui (t3 ,t3 ,2 ,n); + mpres_add (a6 ,a6 ,t3 ,n); // 27*A6 - 9*A2*A4 +2*A2^3 + + mpres_mul_ui (T->A ,T->A ,3 ,n); + mpres_sub (T->A ,T->A ,t1 ,n); // 3*A4 - A2^2 + + + mpres_set_ui (t1 ,3 ,n); + + if ( !mpres_invert(t1,t1,n) ) { // t1 = 1 /3 + mpres_gcd(f ,t1 ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + mpres_clear (t3,n); + return MORPHISM_FAIL; + } + + mpres_mul (T->A ,T->A ,t1 ,n); // A = a4 = (3*A4 - A2^2) / 3 + mpres_mul (t2 ,t1 ,t1 ,n); + mpres_mul (t2 ,t2 ,t1 ,n); + mpres_mul (a6 ,a6 ,t2 ,n); // a6 = (27*A6 - 9*A2*A4 +2*A2^3) / 27 + + mpres_clear (t1,n); + mpres_clear (t2,n); + mpres_clear (t3,n); + return MORPHISM; +} + + + + + + + + +/* + send a point from the long Weiestrass form to the short Weierstrass twisted form by the following morphisms: + (x,y,z) -> (x+A2/3*z,y,z) + (x,y,z) -> (x,y,R*z) + See function coeff_EllW_EllWtwist for notations +*/ +void EllW_to_EllWshort(mpmod_t n,mpalgpol_t pol, + mpalgres_t R,mpres_t A2, + mpalgres_t x,mpalgres_t y,mpalgres_t z, + mpalgres_t aX,mpalgres_t aY,mpalgres_t aZ) { + + mpalgres_t aT1,aT2,aT3,aT4; + + mpalgres_init (aT1 ,n); + mpalgres_init (aT2 ,n); + mpalgres_init (aT3 ,n); + mpalgres_init (aT4 ,n); + + mpalgres_mul_ui (aT1 ,aX ,3 ,pol ,n); + mpalgres_mul_mpres (aT4 ,aZ ,A2 ,pol ,n); + mpalgres_add (aT1 ,aT1 ,aT4 ,pol ,n); // x =3x+A2*z + mpalgres_mul_ui (aT2 ,aY ,3 ,pol ,n); // y =3y + 
mpalgres_mul_ui (aT3 ,aZ ,3 ,pol ,n); // z =3z + + mpalgres_mul (aT3 ,aT3 ,R ,pol ,n); // z *=R + + + mpalgres_set (x ,aT1 ,n); + mpalgres_set (y ,aT2 ,n); + mpalgres_set (z ,aT3 ,n); + + mpalgres_clear (aT1 ,n); + mpalgres_clear (aT2 ,n); + mpalgres_clear (aT3 ,n); + mpalgres_clear (aT4 ,n); + +} + + + + + + + +/* + Send the point (x::z) from the hyperelliptic curve to the elliptic curve in + short weierstrass form defined by q and put the result in T. + Don't touch to x,z + + (x::z) -> ( ( x-(mu+q)z )^2 :: ( x-(mu-q)z )^2 ) + (x::z) -> ( 3x + A2*z :: 3z ) + (x::z) -> (x/z :: 1) + get B = x^3 + a4*x + a6 + Set (T->x,T->y)=(x/B,y/B) and T->A = a4/B^2 +*/ +int HEll_EllW_degree1 (mpz_t f,mpmod_t n, + mpres_t x,mpres_t z, + curve *T,mpres_t A2, mpres_t a6, + curveHyperEll cHEll ) { + + mpres_t t; + mpres_t X,Z; + + mpres_init (t,n); + mpres_init (X,n); + mpres_init (Z,n); + + mpres_add (X ,cHEll->mu ,cHEll->q ,n); + mpres_mul (X ,X ,z ,n); + mpres_sub (X ,x ,X ,n); + mpres_mul (X ,X ,X ,n); // ( x-(mu+q)z )^2 + + mpres_sub (Z ,cHEll->mu ,cHEll->q ,n); + mpres_mul (Z ,Z ,z ,n); + mpres_sub (Z ,x ,Z ,n); + mpres_mul (Z ,Z ,Z ,n); // ( x-(mu-q)z )^2 + + + + mpres_mul_ui (X ,X ,3 ,n); + mpres_mul (t ,Z ,A2 ,n); + mpres_add (X ,X ,t ,n); // 3x + A2*z + mpres_mul_ui (Z ,Z ,3 ,n); // 3z + + if ( mpres_is_zero (Z ,n) ){ + mpres_gcd (f ,Z ,n); + mpres_clear (t,n); + mpres_clear (X,n); + mpres_clear (Z,n); + return MORPHISM_FOUND_ZERO_CURVE; + } + if ( !mpres_invert(t ,Z ,n) ){ + mpres_gcd (f ,Z ,n); + mpres_clear (t,n); + mpres_clear (X,n); + mpres_clear (Z,n); + return MORPHISM_STEP1; + } + + mpres_mul (X ,X ,Z ,n); + + mpres_mul (t ,X ,X ,n); + mpres_mul (t ,t ,X ,n); + mpres_mul (Z ,T->A ,X ,n); + mpres_add (Z ,Z ,t ,n); + mpres_add (Z ,Z ,a6 ,n); // x^3 + a4*x + a6 + + if ( !mpres_invert(T->y ,Z ,n) ){ + mpres_gcd (f ,Z ,n); + mpres_clear (t,n); + mpres_clear (X,n); + mpres_clear (Z,n); + return MORPHISM_FAIL; + } + + mpres_mul (T->x ,T->x ,T->y ,n); + mpres_mul (T->A 
,T->A ,T->y ,n); + mpres_mul (T->A ,T->A ,T->y ,n); + + + mpres_clear (t,n); + mpres_clear (X,n); + mpres_clear (Z,n); + return MORPHISM; +} + + + +/* + double the point on the weierstrass curve + (x,y)->( la^2-2x , -la*(la^2-2x)-mu ) + la = (3x^2+a4)/(2y) mu = y-x*la + return 0 if failed +*/ +int double_short_weierstrass (mpz_t f,mpmod_t n,curve *T) { + mpres_t t1,t2,t3; + + mpres_init (t1 ,n); + mpres_init (t2 ,n); + mpres_init (t3 ,n); + + mpres_mul (t1 ,T->x ,T->x ,n); + mpres_mul_ui (t1 ,t1 ,3 ,n); + mpres_add (t1 ,t1 ,T->A ,n); // (3x^2+a4) + + mpres_mul_ui (t2 ,T->y ,2 ,n); + if ( !mpres_invert(t2 ,t2 ,n) ){ // 1/2y + mpres_gcd (f ,t2 ,n); + mpres_clear (t1,n); + mpres_clear (t2,n); + mpres_clear (t3,n); + return 0; + } + + mpres_mul (t1 ,t1 ,t2 ,n); // la = (3x^2+a4)/(2y) + mpres_mul (t2 ,T->x ,t1 ,n); + mpres_sub (t2 ,T->y ,t2 ,n); // mu = y-x*la + + mpres_mul_ui (T->x ,T->x ,2 ,n); + mpres_mul (t3 ,t1 ,t1 ,n); + mpres_sub (T->x ,t3 ,T->x ,n); // la^2-2x + + mpres_mul (T->y ,t1 ,T->x ,n); + mpres_add (T->y ,T->y ,t2 ,n); + mpres_neg (T->y ,T->y ,n); // -la*(la^2-2x)-mu + + mpres_clear (t1,n); + mpres_clear (t2,n); + mpres_clear (t3,n); + return 1; +} + + + + + + +/* + "add" the two points on the short Weierstrass twisted curve + We only want the (x::z) coordinate + y^2*z = x^3 + a4' * x * z^2 + a6' * z^3 + a4' = a4/R^2 a6' = a6/R^3 + Then go back to the untwisted curve + y^2*z = x^3 + a4 * x * z^2 + a6 * z^3 + (x,y,z) -> (x*R ,?, z); + Scale the y coordinate and come back to non projective curve + (x::z) -> (x/z,?) + compute B = x^3 + a4 * x + a6 (i.e. y^2) + (x,?) 
-> (X,Y) + X = x/B Y=1/B T->A = a4/B^2 +*/ +int addW_and_scale (mpz_t f,mpmod_t n,mpalgpol_t pol, + curve *T,mpres_t a6, + mpalgres_t X1 ,mpalgres_t Y1 ,mpalgres_t Z1, + mpalgres_t X2 ,mpalgres_t Y2 ,mpalgres_t Z2, + mpalgres_t a4twist ,mpalgres_t R) { + int test; + mpalgres_t aT1,aT2; + mpres_t temp; + mpalgres_t La,Den; + mpalgres_t X3,Z3; + mpz_t F[DEGREE_ALGEBRA]; + + mpz_init (F[0]); + mpz_init (F[1]); + mpalgres_init (X3 ,n); + mpalgres_init (Z3 ,n); + + if (mpalgres_is_zero(Z1 ,pol ,n) ) { // the first point is zero + if (mpalgres_is_zero(Z2 ,pol ,n) ) {// the two points are zero + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_FOUND_ZERO_CURVE; + } + + // Maybe the two points are zero modulo a factor of n + mpalgres_gcd (F,Z2,n); + if ( mpz_cmp_ui (F[0],1)!=0 ) { + mpz_set (f,F[0]); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_STEP1; + } + if ( mpz_cmp_ui (F[1],1)!=0 ) { + mpz_set (f,F[1]); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_STEP1; + } + + mpalgres_set (X3,X2,n); + mpalgres_set (Z3,Z2,n); + + } + else { + // Maybe the first points is zero modulo a factor of n + mpalgres_gcd (F,Z1,n); + if ( mpz_cmp_ui (F[0],1)!=0 ) { + mpz_set (f,F[0]); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_FAIL; + } + if ( mpz_cmp_ui (F[1],1)!=0 ) { + mpz_set (f,F[1]); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_FAIL; + } + + // Now the first point is not the zero + + if (mpalgres_is_zero(Z2 ,pol ,n) ) { // the second point is zero + mpalgres_set (X3,X1,n); + mpalgres_set (Z3,Z1,n); + } + else { + // Maybe the second points is zero modulo a factor of n + mpalgres_gcd (F,Z2,n); + if ( mpz_cmp_ui (F[0],1)!=0 ) { + mpz_set (f,F[0]); + mpz_clear (F[0]); + 
mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_FAIL; + } + if ( mpz_cmp_ui (F[1],1)!=0 ) { + mpz_set (f,F[1]); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_FAIL; + } + + + // general case + + mpalgres_init (Den,n); + mpalgres_init (aT1,n); + mpalgres_init (aT2,n); + + mpalgres_mul (Den ,Z1 ,X2 ,pol ,n); + mpalgres_mul (aT1 ,X1 ,Z2 ,pol ,n); + mpalgres_sub (Den ,Den ,aT1 ,pol ,n); // Z1*X2-Z2*X1 + + if (mpalgres_is_zero(Den ,pol ,n) ) { // X1/Z1 = X2/Z2 + mpalgres_mul (aT1 ,Y1 ,Z2 ,pol ,n); + mpalgres_mul (aT2 ,Z1 ,Y2 ,pol ,n); + mpalgres_add (aT1 ,aT1 ,aT2 ,pol ,n); + + if (mpalgres_is_zero(aT1 ,pol ,n) ) { // Y1/Z1 = -Y2/Z2 + // we add a point and its opposite + mpalgres_clear (Den,n); + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_FOUND_ZERO_CURVE; + } + + // Maybe it is the case modulo a factor of n + mpalgres_gcd (F,aT1,n); + if ( mpz_cmp_ui (F[0],1)!=0 ) { + mpz_set (f,F[0]); + mpalgres_clear (Den,n); + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_STEP1; + } + if ( mpz_cmp_ui (F[1],1)!=0 ) { + mpz_set (f,F[1]); + mpalgres_clear (Den,n); + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_STEP1; + } + + + + // Y1/Z1 = Y2/Z2 // doubling case + + if (mpalgres_is_zero(Y1 ,pol ,n)) { // doubling a 2-torsion point + mpalgres_clear (Den,n); + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_FOUND_ZERO_CURVE; + } + + // Maybe it is the case modulo a factor of n + mpalgres_gcd (F,Y1,n); + if ( mpz_cmp_ui (F[0],1)!=0 ) { + 
mpz_set (f,F[0]); + mpalgres_clear (Den,n); + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_STEP1; + } + if ( mpz_cmp_ui (F[1],1)!=0 ) { + mpz_set (f,F[1]); + mpalgres_clear (Den,n); + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_STEP1; + } + + // general doubling case + + mpalgres_init (La,n); + + + + mpalgres_mul (Den,Y1,Z1,pol,n); + mpalgres_mul_ui (Den,Den,2,pol,n); // Den = 2*Y1*Z1 + + mpalgres_mul (La ,X1 ,X1 ,pol,n); + mpalgres_mul_ui (La ,La ,3 ,pol,n); + mpalgres_mul (aT1 ,Z1 ,Z1 ,pol ,n); + mpalgres_mul (aT1 ,aT1 ,a4twist ,pol ,n); + mpalgres_add (La,La,aT1 ,pol,n); // La = 3*X1^2 + a4'*Z1^2 + + mpalgres_mul (X3 ,La ,La ,pol,n); + mpalgres_mul (X3 ,X3 ,Z1 ,pol,n); + + mpalgres_mul (Z3 ,Den ,Den ,pol,n); // Den^2 + mpalgres_mul (aT1 ,Z3 ,X1 ,pol ,n); + mpalgres_mul_ui (aT1 ,aT1 ,2 ,pol ,n); + mpalgres_sub (X3 ,X3 ,aT1 ,pol ,n); // X3 = La^2*Z1 - 2*X1*Den^2 + + mpalgres_mul (Z3 ,Z3 ,Z1 ,pol ,n); // Z3 = Z1*Den^2 + + + + mpalgres_clear (La,n); + mpalgres_clear (Den,n); + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + + } + + else { + + // Maybe we have to double one point modulo a factor of n + mpalgres_gcd (F,Den,n); + if ( mpz_cmp_ui (F[0],1)!=0 ) { + mpz_set (f,F[0]); + mpalgres_clear (Den,n); + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_FAIL; + } + if ( mpz_cmp_ui (F[1],1)!=0 ) { + mpz_set (f,F[1]); + mpalgres_clear (Den,n); + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_FAIL; + } + + + + + // adding case + + mpalgres_init (La,n); + + + + mpalgres_mul (La ,Z1 ,Y2 ,pol ,n); + mpalgres_mul (aT1 ,Y1 
,Z2 ,pol ,n); + mpalgres_sub (La ,La ,aT1 ,pol ,n); // La = Z1*Y2-Y1*Z2 + + + mpalgres_mul (aT2 ,Z1 ,Z2, pol ,n); // Z1*Z2 + mpalgres_mul (La ,La ,La ,pol ,n); + mpalgres_mul (La ,La ,aT2 ,pol ,n); // La^2 * Z1*Z2 + + mpalgres_mul (X3 ,Z1 ,X2 ,pol ,n); + mpalgres_mul (aT1 ,X1 ,Z2 ,pol ,n); + mpalgres_add (X3 ,X3 ,aT1 ,pol ,n); // Z1*X2 + X1*Z2 + + mpalgres_mul (Z3 ,Den ,Den ,pol ,n); // Den^2 + mpalgres_mul (X3 ,X3 ,Z3 ,pol,n); + mpalgres_sub (X3 ,La ,X3 ,pol ,n); + // X3 = La^2*Z1*Z2 - (Z1*X2+X1*Z2)*Den^2 + + mpalgres_mul (Z3 ,Z3 ,aT2 ,pol ,n); // Z3 = Z1*Z2*Den^2 + + + + mpalgres_clear (aT1,n); + mpalgres_clear (aT2,n); + mpalgres_clear (La,n); + mpalgres_clear (Den,n); + + } + } + } + + + + // We have the point (X3::Z3) on the short Weierstrass twisted curve + + mpalgres_mul (X3, X3 ,R ,pol,n); + + + + + if ( mpalgres_is_zero (Z3 ,pol ,n) ) { // Z3 =0 + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3,n); + mpalgres_clear (Z3,n); + return MORPHISM_FOUND_ZERO_CURVE; + } + mpalgres_gcd (F,Z3,n); + if ( mpz_cmp_ui (F[0],1)!=0 ) { // maybe Z3 is 0 modulo a factor of n + mpz_set (f,F[0]); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_STEP1; + } + if ( mpz_cmp_ui (F[1],1)!=0 ) { + mpz_set (f,F[1]); + mpz_clear (F[0]); + mpz_clear (F[1]); + mpalgres_clear (X3 ,n); + mpalgres_clear (Z3 ,n); + return MORPHISM_STEP1; + } + mpz_clear (F[0]); + mpz_clear (F[1]); + + + test = mpalgres_invert (Z3 ,Z3 ,pol,n,f); + if (test == -1) { + mpalgres_clear (X3,n); + mpalgres_clear (Z3,n); + return MORPHISM_ERROR; + } + if (test == 0) { + mpalgres_clear (X3,n); + mpalgres_clear (Z3,n); + return MORPHISM_FAIL; + } + + mpalgres_mul (X3 ,X3 ,Z3 ,pol ,n); // X3/Z3 + + + if ( !mpres_is_zero(X3[1],n) ) { + // This should never happen since X3 should be rationnal + mpalgres_clear (X3,n); + mpalgres_clear (Z3,n); + return MORPHISM_ERROR; + } + + mpz_set (T->x ,X3[0]); + + + + // Compute B=x^3 + a4 * x + a6 (i.e. 
y^2) + mpres_init(temp,n); + + mpres_mul (T->y ,T->x ,T->x ,n); + mpres_mul (T->y ,T->y ,T->x ,n); // x^3 + mpres_mul (temp ,T->A ,T->x ,n); // a4 * x + mpres_add (T->y ,T->y ,temp ,n); + mpres_add (T->y ,T->y ,a6 ,n); // B=x^3 + a4 * x + a6 + + mpres_clear (temp,n); + + + if ( !mpres_invert(T->y ,T->y ,n) ) { // 1/B + mpres_gcd(f,T->y,n); + mpalgres_clear (X3,n); + mpalgres_clear (Z3,n); + return MORPHISM_FAIL; + } + + mpres_mul (T->x ,T->x ,T->y ,n); // X/B + mpres_mul (T->A ,T->A ,T->y ,n); + mpres_mul (T->A ,T->A ,T->y ,n); // a4/B^2 + + mpalgres_clear (X3,n); + mpalgres_clear (Z3,n); + return MORPHISM; +} + + diff -Nru gmp-ecm-7.0.4+ds/hecm/morphismes.h gmp-ecm-7.0.5+ds/hecm/morphismes.h --- gmp-ecm-7.0.4+ds/hecm/morphismes.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/morphismes.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,75 @@ +#ifndef _MORPHISMES_H +#define _MORPHISMES_H + +#include "../ecm-impl.h" +#include "auxi.h" + +#define MORPHISM_ERROR -1 +#define MORPHISM_FAIL 0 // The computation of the morphisms failled +#define MORPHISM_STEP1 1 // We found a factor at the end of stage 1 +#define MORPHISM 2 // The computation of the morphisms is a success +#define MORPHISM_FOUND_ZERO_CURVE_1 3 // We found the zero of the 1st curve +#define MORPHISM_FOUND_ZERO_CURVE_2 4 // We found the zero of the 2nd curve +#define MORPHISM_FOUND_ZERO_CURVE_1_AND_2 5 // We found the zero of both curves +#define MORPHISM_FOUND_ZERO_CURVE 6 // We found the zero of one of the curve + + + +struct DivMumfordU_s { + int degree; // 0,1 ou 2 + mpres_t u0; + mpres_t u1; +}; +typedef struct DivMumfordU_s DivMumfordU[1]; + +void DivMumfordU_init (DivMumfordU divU, mpmod_t n ); +void DivMumfordU_clear (DivMumfordU divU, mpmod_t n ); + + +struct DivMumfordV_s { + mpres_t V0; + mpres_t V1; + mpres_t v1v0; +}; +typedef struct DivMumfordV_s DivMumfordV[1]; + +void DivMumfordV_init (DivMumfordV divV, mpmod_t n ); +void DivMumfordV_clear (DivMumfordV divV, mpmod_t n ); + + + 
+#include "generation.h" +#include "ariKS.h" + + + +int DivMumford (mpz_t f,mpmod_t n,DivMumfordU divU,DivMumfordV divV,ksPoint P,thetaCst th,curveHyperEll cHEll); + +int DivMumfordDegree2 (mpz_t f,mpmod_t n,DivMumfordU divU,DivMumfordV divV,ksPoint P,thetaCst th,curveHyperEll cHEll, mpres_t T13p, mpres_t T14p, mpres_t T16p); + +int degree2_case_y1_equal_zero (mpz_t f,mpmod_t n,curve *T1, curve *T2,DivMumfordU divU, DivMumfordV divV, curveHyperEll cHEll); + + +int jac_to_EllW (mpz_t f,mpmod_t n,curve *T1, curve *T2,DivMumfordU divU, DivMumfordV divV,curveHyperEll cHEll); + +int jac_to_EllW_Degree1 (mpz_t f,mpmod_t n,curve *T1, curve *T2,DivMumfordU divU, curveHyperEll cHEll); + +int jac_to_EllW_Degree2 (mpz_t f,mpmod_t n,curve *T1, curve *T2,DivMumfordU divU, DivMumfordV divV,curveHyperEll cHEll); + +void HEll_to_EllW(mpmod_t n,mpalgpol_t pol,mpalgres_t aX2,mpalgres_t aY2,mpalgres_t aZ2,mpalgres_t aX,mpalgres_t aY,mpalgres_t aZ,curveHyperEll cHEll ); + +int coeff_EllW_EllWtwist(mpz_t f,mpmod_t n,mpalgpol_t pol,curve *T,mpres_t a6,mpres_t A2,mpalgres_t R, mpalgres_t a4twist,mpalgres_t B,mpalgres_t invB,curveHyperEll cHEll ); + +int coeff_EllW(mpz_t f,mpmod_t n,curve *T,mpres_t A2, mpres_t a6,curveHyperEll cHEll ); + + +void EllW_to_EllWshort(mpmod_t n,mpalgpol_t pol,mpalgres_t R,mpres_t A2,mpalgres_t x,mpalgres_t y,mpalgres_t z,mpalgres_t aX,mpalgres_t aY,mpalgres_t aZ); + + +int HEll_EllW_degree1 (mpz_t f,mpmod_t n,mpres_t x,mpres_t z,curve *T,mpres_t A2, mpres_t a6,curveHyperEll cHEll ); + +int double_short_weierstrass (mpz_t f,mpmod_t n,curve *T); + +int addW_and_scale (mpz_t f,mpmod_t n,mpalgpol_t pol,curve *T,mpres_t a6,mpalgres_t X1 ,mpalgres_t Y1 ,mpalgres_t Z1,mpalgres_t X2 ,mpalgres_t Y2 ,mpalgres_t Z2,mpalgres_t a4twist ,mpalgres_t R); + +#endif diff -Nru gmp-ecm-7.0.4+ds/hecm/stage1HECM.c gmp-ecm-7.0.5+ds/hecm/stage1HECM.c --- gmp-ecm-7.0.4+ds/hecm/stage1HECM.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/stage1HECM.c 2022-06-06 
14:16:49.000000000 +0000 @@ -0,0 +1,356 @@ +#include "../ecm-impl.h" +#include "hecm.h" +#include "generation.h" +#include "ariKS.h" +#include "morphismes.h" +#include "auxi.h" + +/* + stage 1 with the normal parametrization + input: n the number to factored + f variable for potential factor + k =lcm(2,..,B1) + para parameters of the curve + T1,T2 for the underlying elliptic curve + options options given by the user + We begin by generating the curve and the point + (if needed and authorized by changing the parameters) + We compute k*P + We send the points on the elliptic curves + We check the existence of factors of n +*/ +int hecm1Normal (mpz_t f,mpmod_t n, + mpz_t k, + paraGenCurve para, + curve *T1,curve *T2, + optionsHECM options) { + + + int test; + + curveHyperEll cHEll; + ksPoint P; + DivMumfordU divU; + DivMumfordV divV; + thetaCst th; + ksCstPourMul cMul; + + mpres_t g; + mpz_t s; + + + curveHyperEll_init (cHEll,n); + ksPoint_init (P,n); + thetaCst_init (th,n); + ksCstPourMul_init (cMul,n); + mpres_init (g,n); + + + // generate the curve + if (options->curveSpecified == TRUE) { // we want fixed s or nJacobi + if (para->nJacobi == 0) { + para->nJacobi = (rand()%(N_JACOBI_MAX_p1-N_JACOBI_MIN))+N_JACOBI_MIN; + } + + test = generateNormalCurveSpecified(f,n,para,th,cHEll,P,cMul); + + } + else { + test = generateNormalCurve(f,n,para,th,cHEll,P,cMul,options); + } + + if (test == GENERATION_FACTOR_FOUND ) { + // f contient bien un diviseur de n ie f != 0 mod n + curveHyperEll_clear (cHEll,n); + ksPoint_clear (P,n); + ksCstPourMul_clear (cMul,n); + thetaCst_clear (th,n); + mpres_clear (g,n); + return HECM_FACTOR_FOUND_GENERATION; + } + else if (test == GENERATION_FAIL) { + curveHyperEll_clear (cHEll,n); + ksPoint_clear (P,n); + ksCstPourMul_clear (cMul,n); + thetaCst_clear (th,n); + mpres_clear (g,n); + return HECM_GENERATION_FAIL; + } + + // check if s is in the good interval + mpz_init (s); + mpres_get_z (s,para->s,n); + if ( mpz_cmp (s,options->heightMax) > 0 ) 
{ + curveHyperEll_clear (cHEll,n); + ksPoint_clear (P,n); + ksCstPourMul_clear (cMul,n); + thetaCst_clear (th,n); + mpz_clear(s); + mpres_clear (g,n); + return HECM_PARAM_TOO_BIG; + } + mpz_clear (s); + + + + // compute k*P + mulKS(P,cMul,th->be,th->ga,k,n); + + + + + ksCstPourMul_clear (cMul,n); + + + DivMumfordU_init (divU,n); + DivMumfordV_init (divV,n); + + + // send the point on the hyperelliptic curve + test = DivMumford (f,n,divU,divV,P,th,cHEll); + mpres_set_z (g,f,n); + + ksPoint_clear (P,n); + thetaCst_clear (th,n); + + if (test == MORPHISM_FAIL ) { + if ( mpres_is_zero(g,n) == 1 ) { // f=0. Let try again + mpres_clear (g,n); + curveHyperEll_clear (cHEll,n); + DivMumfordU_clear (divU,n); + DivMumfordV_clear (divV,n); + return HECM_NO_FACTOR_FOUND; + } + else { // f != 0,n i.e. f is a real divisor of n + mpres_clear (g,n); + curveHyperEll_clear (cHEll,n); + DivMumfordU_clear (divU,n); + DivMumfordV_clear (divV,n); + return HECM_FACTOR_FOUND_MORPHISM; + } + } + + + // send the point on the elliptic curves + // test = jacCEllJacobi (f,n,T1,T2,divU,divV,cHEll); + test = jac_to_EllW (f,n,T1,T2,divU,divV,cHEll); + + mpres_set_z (g,f,n); + + curveHyperEll_clear (cHEll,n); + DivMumfordU_clear (divU,n); + DivMumfordV_clear (divV,n); + + + + + + if (test == MORPHISM_FAIL ) { + if ( mpres_is_zero(g,n) == 1 ) { // f=0. Let's try again + mpres_clear (g,n); + return HECM_NO_FACTOR_FOUND; + } + else { // f != 0,n i.e. 
f is a real divisor of n + mpres_clear (g,n); + return HECM_FACTOR_FOUND_MORPHISM; + } + } + else if (test == MORPHISM_ERROR) { + mpres_clear (g,n); + return HECM_ERROR; + } + else if (test == MORPHISM_FOUND_ZERO_CURVE_1) { + mpres_clear (g,n); + return HECM_FOUND_ZERO_CURVE_1; + } + else if (test == MORPHISM_FOUND_ZERO_CURVE_2) { + mpres_clear (g,n); + return HECM_FOUND_ZERO_CURVE_2; + } + else if (test == MORPHISM_FOUND_ZERO_CURVE_1_AND_2) { + mpres_clear (g,n); + return HECM_FOUND_ZERO_CURVE_1_AND_2; + } + else if (test == MORPHISM_STEP1) { + mpres_clear (g,n); + return HECM_FACTOR_FOUND_STEP1; + } + else { + mpres_clear (g,n); + return HECM_NO_FACTOR_FOUND; + } + +} + + + + + +/* + stage 1 with the small parameters + input: n the number to factored + f variable for potential factor + k =lcm(2,..,B1) + para parameters of the curve + T1,T2 for the underlying elliptic curve + options options given by the user + We begin by generating the curve and the point + (if needed and authorized by changing the parameters) + We compute k*P + We send the points on the elliptic curves + We check the existence of factors of n +*/ +int hecm1LowParam (mpz_t f,mpmod_t n, + mpz_t k, + paraGenCurve para , + curve *T1,curve *T2, + optionsHECM options) { + + + int test; + + curveHyperEll cHEll; + ksPoint P; + DivMumfordU divU; + DivMumfordV divV; + thetaCst th; + ksSmallConstPourMul cMul; + + mpres_t g; + + + curveHyperEll_init (cHEll,n); + ksPoint_init (P,n); + thetaCst_init (th,n); + mpres_init (g,n); + + + + if (options->curveSpecified == TRUE) { + test = generateCurveSmallParamSpecified (f,n,para,th,cHEll,P,cMul); + } + else { + test = generateCurveSmallParam (f,n,para,th,cHEll,P,cMul,para->a,para->b,para->nJacobi,options); + } + + + if (test == GENERATION_FACTOR_FOUND ) { + // f contient bien un diviseur de n + curveHyperEll_clear (cHEll,n); + ksPoint_clear (P,n); + thetaCst_clear (th,n); + mpres_clear (g,n); + return HECM_FACTOR_FOUND_GENERATION; + } + else if (test == 
GENERATION_FAIL) { + curveHyperEll_clear (cHEll,n); + ksPoint_clear (P,n); + thetaCst_clear (th,n); + mpres_clear (g,n); + return HECM_GENERATION_FAIL; + } + else if (test == GENERATION_PARAM_TOO_BIG) { + curveHyperEll_clear (cHEll,n); + ksPoint_clear (P,n); + thetaCst_clear (th,n); + mpres_clear (g,n); + return HECM_PARAM_TOO_BIG; + } + + + if ( ( mpz_cmp (para->a,options->heightMax) > 0 ) || ( mpz_cmp (para->b,options->heightMax) > 0 ) ) { + curveHyperEll_clear (cHEll,n); + ksPoint_clear (P,n); + thetaCst_clear (th,n); + mpres_clear (g,n); + return HECM_PARAM_TOO_BIG; + } + + + + + // compute k*P + mulKSsmallParam (P,cMul,th->be,th->ga,k,n); + + + + + DivMumfordU_init (divU,n); + DivMumfordV_init (divV,n); + + + + // send the point on the hyperelliptic curve + test = DivMumford (f,n,divU,divV,P,th,cHEll); + mpres_set_z (g,f,n); + + ksPoint_clear (P,n); + thetaCst_clear (th,n); + + if (test == MORPHISM_FAIL ) { + if ( mpres_is_zero(g,n) == 1 ) { // f=0. Let's try again + mpres_clear (g,n); + curveHyperEll_clear (cHEll,n); + DivMumfordU_clear (divU,n); + DivMumfordV_clear (divV,n); + return HECM_NO_FACTOR_FOUND; + } + else { // f != 0,n i.e. f is a real divisor of n + mpres_clear (g,n); + curveHyperEll_clear (cHEll,n); + DivMumfordU_clear (divU,n); + DivMumfordV_clear (divV,n); + return HECM_FACTOR_FOUND_MORPHISM; + } + } + + + + // send the point on the elliptic curves + // test = jacCEllJacobi (f,n,T1,T2,divU,divV,cHEll); + test = jac_to_EllW (f,n,T1,T2,divU,divV,cHEll); + + mpres_set_z (g,f,n); + + curveHyperEll_clear (cHEll,n); + DivMumfordU_clear (divU,n); + DivMumfordV_clear (divV,n); + + + + if (test == MORPHISM_FAIL ) { + if ( mpres_is_zero(g,n) == 1 ) { // f=0. Let's try again + mpres_clear (g,n); + return HECM_NO_FACTOR_FOUND; + } + else { // f != 0,n i.e. 
f is a real divisor of n + mpres_clear (g,n); + return HECM_FACTOR_FOUND_MORPHISM; + } + } + else if (test == MORPHISM_ERROR) { + mpres_clear (g,n); + return HECM_ERROR; + } + else if (test == MORPHISM_FOUND_ZERO_CURVE_1) { + mpres_clear (g,n); + return HECM_FOUND_ZERO_CURVE_1; + } + else if (test == MORPHISM_FOUND_ZERO_CURVE_2) { + mpres_clear (g,n); + return HECM_FOUND_ZERO_CURVE_2; + } + else if (test == MORPHISM_FOUND_ZERO_CURVE_1_AND_2) { + mpres_clear (g,n); + return HECM_FOUND_ZERO_CURVE_1_AND_2; + } + else if (test == MORPHISM_STEP1) { + mpres_clear (g,n); + return HECM_FACTOR_FOUND_STEP1; + } + else { + mpres_clear (g,n); + return HECM_NO_FACTOR_FOUND; + } + +} diff -Nru gmp-ecm-7.0.4+ds/hecm/stage2HECM.c gmp-ecm-7.0.5+ds/hecm/stage2HECM.c --- gmp-ecm-7.0.4+ds/hecm/stage2HECM.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/hecm/stage2HECM.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,100 @@ +#include "../ecm-impl.h" +#include "../ecm.h" + +#include "hecm.h" + +/* wrapper for GMP-ECM stage2 for a curve in Weierstrass form + + y^2 = x^2 + A * x + B + + where B is implicitly defined by y^2 - (x^2 + A * x) mod n. +*/ +int +ecmfactor2 (mpz_t f, mpz_t n, mpz_t A, mpz_t x, mpz_t y, mpz_t B2) +{ + ecm_params q; + int res; + + ecm_init (q); + + q->sigma_is_A = -1; // indicates that we give a curve in Weierstrass form + mpz_set (q->sigma, A); + mpz_set (q->x, x); + mpz_set (q->go, y); + mpz_set (q->B2, B2); + + res = ecm_factor (f, n, 0.0, q); + + ecm_clear (q); + + return res; +} + + + +int hecm2 (mpz_t f, mpmod_t n, curve* T1, curve* T2, mpz_t B2) { + int test; + mpres_t g; + mpz_t x,y,A; + mpres_init (g,n); + mpz_init(x); + mpz_init(y); + mpz_init(A); + + + + // stage 2 for the first elliptic curve + mpres_get_z (A,T1->A,n); + mpres_get_z (x,T1->x,n); + mpres_get_z (y,T1->y,n); + test = ecmfactor2 (f, n->orig_modulus, A, x, y, B2); + mpres_set_z (g,f,n); + + if (test != ECM_NO_FACTOR_FOUND) { + if ( mpres_is_zero(g,n) == 1 ) { // f=0. 
+ mpres_clear (g,n); + mpz_clear(x); + mpz_clear(y); + mpz_clear(A); + return HECM_FOUND_ZERO_CURVE_1; + } + else { // f != 0,n i.e. f is a real divisor of n + mpres_clear (g,n); + mpz_clear(x); + mpz_clear(y); + mpz_clear(A); + return HECM_FACTOR_FOUND_STEP2; + } + } + + mpres_get_z (A,T2->A,n); + mpres_get_z (x,T2->x,n); + mpres_get_z (y,T2->y,n); + // stage 2 for the second elliptic curve + test = ecmfactor2 (f, n->orig_modulus, A, x, y, B2); + mpres_set_z (g,f,n); + + if (test != ECM_NO_FACTOR_FOUND) { + if ( mpres_is_zero(g,n) == 1 ) { // f=0. + mpres_clear (g,n); + mpz_clear(x); + mpz_clear(y); + mpz_clear(A); + return HECM_FOUND_ZERO_CURVE_2; + } + else { // f != 0,n i.e. f is a real divisor of n + mpres_clear (g,n); + mpz_clear(x); + mpz_clear(y); + mpz_clear(A); + return HECM_FACTOR_FOUND_STEP2; + } + } + + + mpres_clear (g,n); + mpz_clear(x); + mpz_clear(y); + mpz_clear(A); + return HECM_NO_FACTOR_FOUND; +} diff -Nru gmp-ecm-7.0.4+ds/INSTALL-ecm gmp-ecm-7.0.5+ds/INSTALL-ecm --- gmp-ecm-7.0.4+ds/INSTALL-ecm 2016-02-26 10:31:47.000000000 +0000 +++ gmp-ecm-7.0.5+ds/INSTALL-ecm 2022-06-06 14:16:49.000000000 +0000 @@ -64,8 +64,10 @@ Note 3: If you want to use George Woltman's GWNUM library for speeding up factoring base 2 numbers, obtain the source file from - (on December 2011 the latest source is - source272.zip), build the gwnum library for your operating system, then use + (on May 2020 the latest source is + p95v298b7.source.zip), build the gwnum library for your operating system + (for example under 64-bit Linux go to the gwnum directory and type + "make -f make64"), then use: $ ./configure --with-gwnum= @@ -73,6 +75,10 @@ gwnum.h and related header files. The source file of the gwnum library is available at . + Note: Jonathan Crombie reported an error (Segmentation fault) with GWNUM + 29.6 with some inputs. George Woltman says the error lies in the giants + half-GCD code. This issue is fixed in p95v298b7. 
+ 2) compile the program with: $ make @@ -132,7 +138,7 @@ * [reported by Sam Rawlins] with MinGW under Windows XP (32-bit), the compilation fails in spv.c. A fix seems to add -msse2 to CFLAGS. - See http://lists.gforge.inria.fr/pipermail/ecm-discuss/2010-June/004077.html + See https://sympa.inria.fr/sympa/arc/ecm-discuss/2010-06/msg00000.html * GCC 4.4 might miscompile GMP-ECM on Sparc, see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45559 for more details. The problem is due in fact to a bug in the Linux kernel. A fix is to use @@ -174,7 +180,8 @@ Windows login name. 5) Download GMP-ECM (if you do not have it already) from - http://ecm.gforge.inria.fr/ and place it in your home folder as well. + https://gitlab.inria.fr/zimmerma/ecm and place it in your home folder + as well. 6) Start the MinSys up again from the desktop and type tar -xvzf gmp-6.1.0.tar.gz @@ -208,6 +215,6 @@ - the output you get. -Then send your bug report at . +Then send your bug report at . This is a public list, with archives available at -. +. diff -Nru gmp-ecm-7.0.4+ds/ks-multiply.c gmp-ecm-7.0.5+ds/ks-multiply.c --- gmp-ecm-7.0.4+ds/ks-multiply.c 2016-08-22 11:10:25.000000000 +0000 +++ gmp-ecm-7.0.5+ds/ks-multiply.c 2022-06-06 14:16:49.000000000 +0000 @@ -547,10 +547,10 @@ unsigned int ks_wrapmul_m (unsigned int m0, unsigned int k, mpz_t n) { +#ifdef FFT_WRAP mp_size_t t, s; unsigned long i, m; -#ifdef FFT_WRAP t = mpz_sizeinbase (n, 2); s = t * 2 + 1; for (i = k - 1; i; s++, i >>= 1); diff -Nru gmp-ecm-7.0.4+ds/listz_handle.c gmp-ecm-7.0.5+ds/listz_handle.c --- gmp-ecm-7.0.4+ds/listz_handle.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/listz_handle.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,557 @@ +#define _GNU_SOURCE +#include "config.h" +#include +#include +#ifdef HAVE_FCNTL_H +#include +#endif +#ifdef _OPENMP +#include +#endif +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) +/* For EAGAIN etc. 
*/ +#include +#endif +#include "listz_handle.h" + +/* #define TRACE_ITER yes */ + +/* Init a listz_handle_t to store up to len residues (modulo m). + If filename != NULL, uses disk storage, otherwise memory. + Returns NULL if something goes wrong (i.e., if a memory allocation + or opening a file fails) */ + +listz_handle_t +listz_handle_init (const char *filename, const uint64_t len, const mpz_t m) +{ + listz_handle_t F; + void *buf; + + F = malloc (sizeof (_listz_handle_t)); + if (F == NULL) + return NULL; + + /* Find out how many file_word_t's m has */ + buf = (file_word_t *) mpz_export (NULL, &F->words, -1, sizeof(file_word_t), + -1, 0, m); + if (buf == NULL) + { + free (F); + return NULL; + } + free(buf); + + F->len = len; + if (filename == NULL) + { + F->storage = 0; /* Memory storage */ + F->data.mem = init_list2 (len, mpz_sizeinbase(m, 2)); + if (F->data.mem == NULL) + { + free (F); + F = NULL; + } + } else { + F->storage = 1; /* Disk storage */ + F->filename = (char *) malloc ((strlen (filename) + 1) * sizeof(char)); + if (F->filename == NULL) + { + free (F); + return NULL; + } + strcpy (F->filename, filename); + F->data.file = fopen (F->filename, "rb+"); + if (F->data.file == NULL) + F->data.file = fopen (F->filename, "wb+"); + if (F->data.file == NULL) + { + free (F->filename); + free (F); + return NULL; + } +#ifdef HAVE_FALLOCATE + fallocate (fileno(F->data.file), 0, (off_t) 0, + F->words * sizeof(file_word_t) * len); +#endif +#if defined(HAVE_SETVBUF) && defined(HAVE_AIO_READ) + /* Set to unbuffered mode as we use aio_*() functions for reading + in the background */ + setvbuf (F->data.file, NULL, _IONBF, 0); +#endif + } + + return F; +} + + +listz_handle_t +listz_handle_from_listz (const listz_t l, const uint64_t len, const mpz_t m) +{ + listz_handle_t F; + void *buf; + + F = malloc (sizeof (_listz_handle_t)); + if (F == NULL) + return NULL; + + /* Find out how many file_word_t's m has */ + buf = (file_word_t *) mpz_export (NULL, &F->words, -1, 
sizeof(file_word_t), + -1, 0, m); + if (buf == NULL) + { + free (F); + return NULL; + } + free(buf); + + F->len = len; + F->storage = 0; /* Memory storage */ + F->data.mem = l; + + return F; +} + + +void +listz_handle_clear (listz_handle_t F) +{ + if (F->storage == 0) + { + clear_list (F->data.mem, F->len); + F->data.mem = NULL; + } + else + { + fclose (F->data.file); + F->data.file = NULL; + remove (F->filename); + free (F->filename); + } + free (F); +} + + +static inline void +export_residue (file_word_t *buf, const size_t bufsize, const mpz_t r) +{ + size_t nr; + + /* Export r to buf */ + mpz_export (buf, &nr, -1, sizeof(file_word_t), 0, 0, r); + ASSERT_ALWAYS (nr <= bufsize); + + /* Pad buf with zeroes */ + for ( ; nr < bufsize; nr++) + buf[nr] = 0; +} + + +static inline int +listz_handle_seek_entry (listz_handle_t F, const uint64_t index) +{ + int64_t foffset; + + ASSERT (F->storage == 1); + ASSERT (index <= INT64_MAX); + ASSERT (index <= INT64_MAX / sizeof(file_word_t) / F->words); + foffset = (int64_t) index * F->words * sizeof(file_word_t); + return aux_fseek64 (F->data.file, foffset, SEEK_SET); +} + + +/* Output a polynomial of degree len-1, or a monic polynomial of degree len. + In either case, len is the number of coefficients read from "l". + If symmetric == 1, then the polynomial is printed as a reciprocal Laurent + polynomial where the coefficients stored in l (and perhaps the leading + monomial) are in standard basis. 
*/ + +void +listz_handle_output_poly (const listz_handle_t l, const uint64_t len, + const int monic, const int symmetric, + const char *prefix, const char *suffix, + const int verbosity) +{ + uint64_t i; + mpz_t m; + listz_iterator_t *iter; + + if (!test_verbose(verbosity)) + return; + + if (prefix != NULL) + outputf (verbosity, prefix); + + if (len == 0) + { + if (monic) + outputf (verbosity, "1\n"); + else + outputf (verbosity, "0\n"); + return; + } + + mpz_init (m); + iter = listz_iterator_init (l, 0); + for (i = 0; i < len; i++) + { + const uint64_t deg = len - ((monic != 0) ? 0U : 1U); + const char *plus = (i < deg) ? " + " : ""; + listz_iterator_read (iter, m); + if (symmetric) + outputf (verbosity, "Mod(%Zd,N) * (x^%" PRIu64 " + x^-%" PRIu64 ")%s", + m, i, i, plus); + else + outputf (verbosity, "Mod(%Zd,N) * x^%" PRIu64 "%s", m, i, plus); + } + if (monic) + { + if (symmetric) + outputf (verbosity, "(x^%" PRIu64 " + x^-%" PRIu64 ")", len, len); + else + outputf (verbosity, "x^%" PRIu64, len); + } + listz_iterator_clear (iter); + if (suffix != NULL) + outputf (verbosity, suffix); + mpz_clear (m); +} + + +/* Iterator functions for sequential access to elements of a + list_handle_t. 
*/ + +static void +listz_iterator_fetch (listz_iterator_t *iter, const uint64_t offset) +{ + ASSERT (iter->handle->storage == 1); + iter->offset = offset; +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + iter->cb.aio_offset = (off_t) offset * iter->handle->words * sizeof(file_word_t); + iter->cb.aio_buf = iter->buf[iter->active_buffer]; + iter->cb.aio_nbytes = iter->bufsize * iter->handle->words * sizeof(file_word_t); + { + int r = aio_read (&iter->cb); + if (r != 0) + { + fprintf (stderr, "%s(): aio_read() returned %d\n", __func__, r); + abort (); + } + } +#else /* ifdef HAVE_AIO_READ */ +#ifdef _OPENMP +#pragma omp critical +#endif + { + listz_handle_seek_entry (iter->handle, iter->offset); + iter->valid = fread (iter->buf, iter->handle->words * sizeof(file_word_t), + iter->bufsize, iter->handle->data.file); + } +#endif /* ifdef HAVE_AIO_READ else */ +} + + +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) +static size_t +listz_iterator_suspend (struct aiocb * const cb) +{ + int r; + ssize_t s; + + do { + const struct aiocb * aiocb_list[1] = {cb}; + r = aio_suspend (aiocb_list, 1, NULL); + } while (r == EAGAIN || r == EINTR); + if (r != 0) + { + fprintf (stderr, "%s(): aio_suspend() returned error, errno = %d\n", + __func__, errno); + abort (); + } + + s = aio_return (cb); + if (s < 0) + { + fprintf (stderr, "%s(): aio_return() returned error code %ld\n", + __func__, (long int) s); + abort(); + } + return (size_t) s; +} +#endif + + +static void +listz_iterator_flush (listz_iterator_t *iter) +{ + + ASSERT (iter->handle->storage == 1); + if (iter->writeptr == 0) + return; + +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + { + size_t nbytes, written; + int r; + + iter->cb.aio_offset = + (off_t) iter->offset * iter->handle->words * sizeof(file_word_t); + iter->cb.aio_buf = iter->buf[iter->active_buffer]; + nbytes = iter->writeptr * iter->handle->words * sizeof(file_word_t); + iter->cb.aio_nbytes = nbytes; + r = aio_write (&iter->cb); + if (r != 0) + { + fprintf 
(stderr, "%s(): aio_write() returned error, errno = %d\n", + __func__, errno); + abort (); + } + written = listz_iterator_suspend (&iter->cb); + ASSERT_ALWAYS (written == nbytes); + } +#else +#ifdef _OPENMP +#pragma omp critical +#endif + { + size_t written; + listz_handle_seek_entry (iter->handle, iter->offset); + written = fwrite (iter->buf, sizeof(file_word_t) * iter->handle->words, + iter->writeptr, iter->handle->data.file); + ASSERT_ALWAYS (written == iter->writeptr); + } +#endif + iter->writeptr = 0; +} + + +listz_iterator_t * +listz_iterator_init2 (listz_handle_t h, const uint64_t firstres, + const size_t nr_buffered) +{ + listz_iterator_t *iter; + + iter = (listz_iterator_t *) malloc (sizeof(listz_iterator_t)); + if (iter == NULL) + return NULL; + iter->handle = h; + + if (iter->handle->storage == 0) + iter->readptr = iter->writeptr = (size_t) firstres; + else + { + iter->offset = firstres; + iter->readptr = iter->writeptr = iter->valid = 0; + iter->bufsize = nr_buffered; +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + iter->buf[0] = malloc (iter->bufsize * iter->handle->words * + sizeof(file_word_t)); + iter->buf[1] = malloc (iter->bufsize * iter->handle->words * + sizeof(file_word_t)); + if (iter->buf[0] == NULL || iter->buf[1] == NULL) + { + free (iter->buf[0]); + free (iter->buf[1]); + free (iter); + return NULL; + } +#ifdef _OPENMP +#pragma omp critical +#endif + { + /* Prevent other access to the file which would lead to data + corruption */ + iter->hidden_file = h->data.file; + h->data.file = NULL; + } + ASSERT_ALWAYS (iter->hidden_file != NULL); + iter->active_buffer = 0; + memset (&iter->cb, 0, sizeof(struct aiocb)); + iter->cb.aio_fildes = fileno (iter->hidden_file); + iter->cb.aio_reqprio = 0; + iter->cb.aio_sigevent.sigev_notify = SIGEV_NONE; +#else + iter->buf = malloc (iter->bufsize * iter->handle->words * + sizeof(file_word_t)); + if (iter->buf == NULL) + { + free (iter); + return NULL; + } +#endif + } + + return iter; +} + + 
+listz_iterator_t * +listz_iterator_init (listz_handle_t h, const uint64_t firstres) +{ + const size_t listz_iterator_nr_buffered = 1<<16; + return listz_iterator_init2 (h, firstres, listz_iterator_nr_buffered); +} + + +void +listz_iterator_clear (listz_iterator_t *iter) +{ + if (iter->handle->storage == 1) + { +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + if (iter->valid > 0) + { + listz_iterator_suspend (&iter->cb); + } +#endif + listz_iterator_flush (iter); +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + iter->handle->data.file = iter->hidden_file; + free (iter->buf[0]); + free (iter->buf[1]); +#else + free (iter->buf); +#endif + } + free (iter); +} + + +/* Outside of listz_iterator_newbuf() we have: + If iter->valid == 0, then there is no outstanding read request + If iter->valid != 0, then there is an outstanding read request for the + non-active buffer + There is no outstanding write request + If there is an outstanding read request, then iter->offset is the start + position in the file (in units of residues) of that read request +*/ + +static inline void +listz_iterator_switchbuf (listz_iterator_t *iter) +{ +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + if (iter->valid == 0) + { + /* We never read data before. 
We have to fetch data for the + current buffer and wait for it to finish, then issue a fetch + for the next buffer */ + listz_iterator_fetch (iter, iter->offset); + iter->active_buffer ^= 1; + } + iter->valid = listz_iterator_suspend (&iter->cb) + / sizeof(file_word_t) / iter->handle->words; +#endif + listz_iterator_flush (iter); + listz_iterator_fetch (iter, iter->offset + iter->valid); +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + iter->active_buffer ^= 1; +#endif + iter->readptr = 0; + ASSERT_ALWAYS (iter->valid > 0); +} + + +void +listz_iterator_read (listz_iterator_t *iter, mpz_t r) +{ + if (iter->handle->storage == 0) + { + mpz_set (r, iter->handle->data.mem[iter->readptr]); +#if defined(TRACE_ITER) + gmp_printf ("%s(): readptr = %" PRIu64 ", r = %Zd (in memory)\n", + __func__, (uint64_t) iter->readptr, r); +#endif + } + else + { + /* Try to detect incorrect use of iterator. We allow either read-only, + in which case we must have writeptr == 0 at all times, or sequential + update (read-then-write) of each residue, in which case we must have + writeptr == readptr here */ + ASSERT (iter->writeptr == 0 || iter->readptr == iter->writeptr); + + if (iter->readptr == iter->valid) + listz_iterator_switchbuf (iter); + +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + mpz_import (r, iter->handle->words, -1, sizeof(file_word_t), 0, 0, + &iter->buf[iter->active_buffer][iter->readptr * iter->handle->words]); +#else + mpz_import (r, iter->handle->words, -1, sizeof(file_word_t), 0, 0, + &iter->buf[iter->readptr * iter->handle->words]); +#endif +#if defined(TRACE_ITER) + gmp_printf ("%s(): offset = %" PRIu64 ", readptr = %" PRIu64 + ", r = %Zd (on disk)\n", + __func__, iter->offset, (uint64_t) iter->readptr, r); +#endif + } + iter->readptr++; +} + + +void +listz_iterator_write (listz_iterator_t *iter, const mpz_t r) +{ + if (iter->handle->storage == 0) + { +#if defined(TRACE_ITER) + gmp_printf ("%s(): writeptr = %"PRIu64", r = %Zd\n", + __func__, (uint64_t) 
iter->writeptr, r); +#endif + mpz_set (iter->handle->data.mem[iter->writeptr], r); + } + else + { + /* Try to detect incorrect use of iterator. We allow either write-only, + in which case we must have readptr == 0 at all times, or sequential + update (read-then-write) of each residue, in which case we must have + writeptr + 1 == readptr */ + ASSERT (iter->readptr == 0 || iter->writeptr + 1 == iter->readptr); + ASSERT (iter->writeptr <= iter->bufsize); +#if defined(TRACE_ITER) + gmp_printf ("%s(): offset = %"PRIu64", writeptr = %"PRIu64 + ", r = %Zd (on disk)\n", + __func__, iter->offset, (uint64_t) iter->writeptr, r); +#endif + if (iter->writeptr == iter->bufsize) + { +#if defined(TRACE_ITER) + printf ("%s(): flushing %"PRIu64" entries from buffer %d\n", + iter->writeptr, iter->active_buffer); +#endif + listz_iterator_flush (iter); + iter->offset += iter->bufsize; + } + ASSERT_ALWAYS (mpz_sgn (r) >= 0); + /* TODO: we may want to allow residues that are not fully reduced + (mod modulus), but only as far as the ECRT reduces them. */ +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + export_residue (&iter->buf[iter->active_buffer][iter->writeptr * iter->handle->words], + iter->handle->words, r); +#else + export_residue (&iter->buf[iter->writeptr * iter->handle->words], + iter->handle->words, r); +#endif + } + iter->writeptr++; +} + +/* Functions that can be used as callbacks to listz_iterator_read() and + listz_iterator_write() in mpzspv_fromto_mpzv(). Note that calling, e.g., + listz_iterator_read() by de-referencing a pointer of type mpz_producerfunc_t + leads to undefined program behavior according to the C standard, even though + it happens to work fine on x86[_64] architectures at least. On other + architectures, a function pointer may carry contextual information which + could be incorrect when de-referencing a function pointer of the wrong type. 
+*/ +void +listz_iterator_read_callback (void *iter, mpz_t r) +{ + listz_iterator_read ((listz_iterator_t *) iter, r); +} + +void +listz_iterator_write_callback (void *iter, const mpz_t r) +{ + listz_iterator_write ((listz_iterator_t *) iter, r); +} diff -Nru gmp-ecm-7.0.4+ds/listz_handle.h gmp-ecm-7.0.5+ds/listz_handle.h --- gmp-ecm-7.0.4+ds/listz_handle.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/listz_handle.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,67 @@ +#include +#ifdef HAVE_AIO_H +#include +#endif +#include "basicdefs.h" +#include "ecm-impl.h" + +/* Defining WANT_AIO makes iterators use a double buffer with aio_() functions + for file access. Unfortunately this does not seem to be faster than plain + fread()/fwrite(). */ +/* #define WANT_AIO 1 */ + +/* This type is the basis for file I/O of mpz_t */ +typedef unsigned long file_word_t; + +typedef struct { + int storage; /* memory = 0, file = 1 */ + uint64_t len; + size_t words; /* Number of file_word_t in a residue */ + union { + listz_t mem; + FILE *file; + } data; + char *filename; +} _listz_handle_t; +typedef _listz_handle_t *listz_handle_t; + + +/* The only permissible access modes for listz_iterator_*() functions are read-only, + write-only, and read-then-write to each residue in sequence. */ + +typedef struct{ + listz_handle_t handle; +#if defined(HAVE_AIO_READ) && defined(WANT_AIO) + struct aiocb cb; + file_word_t *buf[2]; + int active_buffer; + FILE *hidden_file; +#else + file_word_t *buf; +#endif + size_t bufsize; /* Size of buffer, in units of residues */ + uint64_t offset; /* First buffered element's offset relative to + start of file, in units of residues (handle->words * + sizeof(file_word_t)) */ + size_t valid; /* Number of valid residues in buffer */ + size_t readptr, writeptr; /* In unit of residues, relative to + current buf. If handle is stored in memory, + is index of the next mpz_t to read or write, + resp. 
*/ +} listz_iterator_t; + +listz_handle_t listz_handle_init (const char *, uint64_t, const mpz_t); +listz_handle_t listz_handle_from_listz (listz_t, uint64_t, const mpz_t); +void listz_handle_clear (listz_handle_t); +void listz_handle_get (listz_handle_t, mpz_t, file_word_t *, uint64_t); +void listz_handle_get2 (listz_handle_t, mpz_t, uint64_t); +void listz_handle_set (listz_handle_t, const mpz_t, file_word_t *, uint64_t); +void listz_handle_output_poly (const listz_handle_t, uint64_t, int, int, const char *, const char *, int); + +listz_iterator_t *listz_iterator_init (listz_handle_t, uint64_t); +listz_iterator_t *listz_iterator_init2 (listz_handle_t, uint64_t, size_t); +void listz_iterator_clear (listz_iterator_t *); +void listz_iterator_read (listz_iterator_t *, mpz_t); +void listz_iterator_write (listz_iterator_t *, const mpz_t); +void listz_iterator_read_callback (void *, mpz_t); +void listz_iterator_write_callback (void *, const mpz_t); diff -Nru gmp-ecm-7.0.4+ds/m4/valgrind-tests.m4 gmp-ecm-7.0.5+ds/m4/valgrind-tests.m4 --- gmp-ecm-7.0.4+ds/m4/valgrind-tests.m4 2016-06-28 06:12:04.000000000 +0000 +++ gmp-ecm-7.0.5+ds/m4/valgrind-tests.m4 1970-01-01 00:00:00.000000000 +0000 @@ -1,25 +0,0 @@ -# gl_VALGRIND_TESTS() -# ------------------- -# Check if valgrind is available, and set VALGRIND to it if available. -AC_DEFUN([gl_VALGRIND_TESTS], -[ - AC_ARG_ENABLE(valgrind-tests, - AS_HELP_STRING([--enable-valgrind-tests], - [run self tests under valgrind]), - [opt_valgrind_tests=$enableval], [opt_valgrind_tests=no]) - # Run self-tests under valgrind? 
- if test "$opt_valgrind_tests" = "yes" && test "$cross_compiling" = no; then - AC_CHECK_PROGS(VALGRIND, valgrind) - fi - OPTS="-q --error-exitcode=1 --leak-check=full" - if test -n "$VALGRIND" \ - && $VALGRIND $OPTS $SHELL -c 'exit 0' > /dev/null 2>&1; then - opt_valgrind_tests=yes - VALGRIND="$VALGRIND $OPTS" - else - opt_valgrind_tests=no - VALGRIND= - fi - AC_MSG_CHECKING([whether self tests are run under valgrind]) - AC_MSG_RESULT($opt_valgrind_tests) -]) diff -Nru gmp-ecm-7.0.4+ds/main.c gmp-ecm-7.0.5+ds/main.c --- gmp-ecm-7.0.4+ds/main.c 2016-08-22 11:05:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/main.c 2022-06-06 14:16:49.000000000 +0000 @@ -30,6 +30,8 @@ #include "ecm-impl.h" #include "ecm-ecm.h" +#include "config.h" + #ifdef HAVE_UNISTD_H /* for access() */ # include #else @@ -55,6 +57,7 @@ #include "torsions.h" /* to benefit from more torsion groups */ #endif + /* #define DEBUG */ static int exit_asap_value = 0; @@ -118,7 +121,7 @@ printf (" -save file save residues at end of stage 1 to file\n"); printf (" -savea file like -save, appends to existing files\n"); printf (" -resume file resume residues from file, reads from stdin if file is \"-\"\n"); - printf (" -chkpnt file save periodic checkpoints during stage 1 to file\n"); + printf (" -chkpnt file save periodic checkpoints during stage 1 to file (for -param 0)\n"); printf (" -primetest perform a primality test on input\n"); printf (" -treefile f [ECM only] store stage 2 data in files f.0, ... 
\n"); printf (" -maxmem n use at most n MB of memory in stage 2\n"); @@ -135,11 +138,14 @@ printf (" -bloads file With -param 1-3, load stage 1 exponent from file.\n"); #ifdef WITH_GPU printf (" -gpu Use GPU-ECM for stage 1.\n"); +#if HAVE_CGBN_H + printf (" -cgbn Use CGBN for GPU-ECM stage 1 computation.\n"); +#endif /* HAVE_CGBN_H */ printf (" -gpudevice n Use device n to execute GPU code (by default, " "CUDA chooses)\n"); printf (" -gpucurves n Compute on n curves in parallel on the GPU (by " "default, CUDA chooses)\n"); -#endif +#endif /* WITH_GPU */ printf (" -h, --help Prints this help and exit.\n"); } @@ -350,6 +356,7 @@ mpcandi_t n; mpgocandi_t go; mpq_t rat_x0, rat_y0, rat_A; + mpz_t numer_A, denom_A; /* used in Hessian stuff */ double B1, B1done; int result, returncode = 0; int verbose = OUTPUT_NORMAL; /* verbose level */ @@ -397,7 +404,8 @@ unsigned long gw_n = 0; /* set default values for gwnum poly k*b^n+c */ signed long gw_c = 0; /* set default values for gwnum poly k*b^n+c */ #endif - int use_gpu = 0; /* Do we use the GPU for stage 1 (by default no)*/ + int use_gpu = 0; /* Do we use the GPU for stage 1 (by default no) */ + int gpucgbn = 0; /* Do we use CGBN for stage 1 GPU computation (by default no) */ int gpudevice = -1; /* Which device do we use for GPU code (by default CUDA */ /* chooses) */ unsigned int gpucurves = 0; /* How many curves do we want for GPU code */ @@ -783,8 +791,19 @@ else if (strcmp (argv[1], "-gpu") == 0) { use_gpu = 1; - argv++; - argc--; + argv++; + argc--; + } + else if (strcmp (argv[1], "-cgbn") == 0) + { + use_gpu = 1; + gpucgbn = 1; +#ifndef HAVE_CGBN_H + fprintf (stderr, "CGBN not present; configure with --with-cgbn-include\n"); + exit (EXIT_FAILURE); +#endif /* !HAVE_CGBN_H */ + argv++; + argc--; } else if ((argc > 2) && (strcmp (argv[1], "-gpudevice") == 0)) { @@ -806,10 +825,10 @@ } } - /* check that S is even for old P-1 stage 2 */ - if ((method == ECM_PM1) && (S != ECM_DEFAULT_S) && (S % 2 != 0)) + if ((method == 
ECM_PM1 || method == ECM_PP1) && (S != ECM_DEFAULT_S && S != 1)) { - fprintf (stderr, "Error, S should be even for P-1\n"); + fprintf (stderr, "Error, %s not supported for %s\n", + S < 0 ? "-dickson" : "-power", method == ECM_PM1 ? "-pm1" : "-pp1" ); exit (EXIT_FAILURE); } @@ -1026,7 +1045,8 @@ params->TreeFilename = TreeFilename; params->maxmem = maxmem; params->stage1time = stage1time; - params->gpu = use_gpu; /* If WITH_GPU is not defined it will always be 0 */ + params->gpu = use_gpu; /* If WITH_GPU is not defined it will always be 0 */ + params->gpu_cgbn = gpucgbn; /* If !HAVE_CGBN_H will always be 0 */ params->gpu_device = gpudevice; /* If WITH_GPU is not defined or */ /* use_gpu = 0, it has no meaning */ params->gpu_number_of_curves = gpucurves; /* If WITH_GPU is not defined or */ @@ -1153,7 +1173,8 @@ break; if (params->E->type == ECM_EC_TYPE_WEIERSTRASS - || params->E->type == ECM_EC_TYPE_HESSIAN) + || params->E->type == ECM_EC_TYPE_HESSIAN + || params->E->type == ECM_EC_TYPE_TWISTED_HESSIAN) params->sigma_is_A = -1; else params->sigma_is_A = mpz_sgn (sigma) == 0; /* sure? 
*/ @@ -1237,7 +1258,18 @@ /* Set effective seed for factoring attempt on this number */ if (specific_A) { - returncode = mod_from_mpq (A, rat_A, n.n, verbose); + if (param != ECM_PARAM_TWISTED_HESSIAN) + { + returncode = mod_from_mpq (A, rat_A, n.n, verbose); + } + else + { + mpz_init(numer_A); + mpz_init(denom_A); + mpz_mod(numer_A, mpq_numref(rat_A), n.n); + mpz_mod(denom_A, mpq_denref(rat_A), n.n); + returncode = ECM_NO_FACTOR_FOUND; + } if (returncode != ECM_NO_FACTOR_FOUND) goto free_all1; } @@ -1324,7 +1356,8 @@ } else if (param != ECM_PARAM_DEFAULT && !IS_BATCH_MODE (param) && param != ECM_PARAM_SUYAMA && param != ECM_PARAM_WEIERSTRASS - && param != ECM_PARAM_HESSIAN) + && param != ECM_PARAM_HESSIAN + && param != ECM_PARAM_TWISTED_HESSIAN) { fprintf (stderr, "Error, invalid -param value: %d\n", param); exit (EXIT_FAILURE); @@ -1336,12 +1369,27 @@ /* this is a hack to produce an error in ecm() when -bsaves is used but we are not in batch mode */ if (savefile_s != NULL) - mpz_set_ui (params->batch_s, 2); + { + if (!IS_BATCH_MODE(param)) + { + fprintf (stderr, "Error, -bsaves makes sense in batch mode only\n"); + exit (EXIT_FAILURE); + } + /* if batch_s <> 1, it means it was already initialized, + thus don't discard it, for example with -c 2 */ + if (mpz_cmp_ui (params->batch_s, 1) == 0) /* not initialized */ + mpz_set_ui (params->batch_s, 2); + } /* load batch product s from a file */ if (loadfile_s != NULL) { int st = cputime (); + if (!IS_BATCH_MODE(param)) + { + fprintf (stderr, "Error, -bloads makes sense in batch mode only\n"); + exit (EXIT_FAILURE); + } params->batch_last_B1_used = B1; if (read_s_from_file (params->batch_s, loadfile_s, B1)) { @@ -1351,8 +1399,8 @@ else if (verbose >= OUTPUT_VERBOSE) fprintf (stdout, "Reading batch product (of %"PRIu64" bits) of " "primes up to B1=%1.0f from %s took %ldms\n", - mpz_sizeinbase (params->batch_s, 2), B1, - loadfile_s, cputime () - st); + (uint64_t) mpz_sizeinbase (params->batch_s, 2), B1, + loadfile_s, 
cputime () - st); } /* set parameters that may change from one curve to another */ @@ -1382,11 +1430,23 @@ } else if (params->param == ECM_PARAM_HESSIAN) { + /* use x^3+y^3+1=3*A*x*y */ mpz_set (params->E->a4, A); params->sigma_is_A = -1; params->E->type = ECM_EC_TYPE_HESSIAN; params->E->law = ECM_LAW_HOMOGENEOUS; } + else if (params->param == ECM_PARAM_TWISTED_HESSIAN) + { + /* use a*x^3+y^3+1=d*x*y with A=a^3/d -- trick! */ + mpz_set (params->E->a4, numer_A); + mpz_set (params->E->a6, denom_A); + mpz_clear(numer_A); + mpz_clear(denom_A); + params->sigma_is_A = -1; + params->E->type = ECM_EC_TYPE_TWISTED_HESSIAN; + params->E->law = ECM_LAW_HOMOGENEOUS; + } } #ifdef HAVE_TORSION else if (torsion != NULL) @@ -1454,8 +1514,8 @@ /* now call the ecm library */ if (result == ECM_NO_FACTOR_FOUND) - /* if torsion was used, some factor may have been found... */ - result = ecm_factor (f, n.n, B1, params); + /* if torsion was used, some factor may have been found... */ + result = ecm_factor (f, n.n, B1, params); if (result == ECM_ERROR) { @@ -1486,6 +1546,7 @@ do { if (params->gpu) + /* gpu returns multiple factors as f = f0 + f1*n + ... 
+ fk*n^k */ mpz_fdiv_qr (f, tmp_factor, f, tmp_n); else mpz_set (tmp_factor, f); diff -Nru gmp-ecm-7.0.4+ds/Makefile.am gmp-ecm-7.0.5+ds/Makefile.am --- gmp-ecm-7.0.4+ds/Makefile.am 2016-08-23 12:25:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/Makefile.am 2022-06-06 14:16:49.000000000 +0000 @@ -43,7 +43,7 @@ random.c factor.c sp.c spv.c spm.c mpzspm.c mpzspv.c \ ntt_gfp.c ecm_ntt.c pm1fs2.c sets_long.c \ auxarith.c batch.c parametrizations.c cudawrapper.c \ - aprtcle/mpz_aprcl.c + aprtcle/mpz_aprcl.c addlaws.c torsions.c # Link the asm redc code (if we use it) into libecm.la libecm_la_CPPFLAGS = $(MULREDCINCPATH) libecm_la_CFLAGS = $(OPENMP_CFLAGS) -g @@ -53,7 +53,10 @@ libecm_la_LDFLAGS = $(LIBECM_LDFLAGS) -version-info 1:0:0 -g libecm_la_LIBADD = $(MULREDCLIBRARY) if WANT_GPU - libecm_la_SOURCES += cudakernel.cu + libecm_la_SOURCES += cudakernel.cu cudacommon.cu +if WANT_CGBN + libecm_la_SOURCES += cgbn_stage1.cu +endif libecm_la_LIBADD += $(CUDALIB) libecm_la_LDFLAGS += $(CUDALDFLAGS) ecm_LDFLAGS = $(CUDARPATH) @@ -69,6 +72,7 @@ ecm_CPPFLAGS = -DOUTSIDE_LIBECM ecm_CFLAGS = $(OPENMP_CFLAGS) -g ecm_SOURCES = auxi.c b1_ainc.c candi.c eval.c main.c resume.c \ + addlaws.c torsions.c \ getprime_r.c champions.h aprtcle/mpz_aprcl.c memusage.c tune_SOURCES = mpmod.c tune.c mul_lo.c listz.c auxlib.c ks-multiply.c \ @@ -90,7 +94,7 @@ rho_SOURCES = rho.c rho_CPPFLAGS = -DTESTDRIVE -rho_LDADD = -lprimegen $(GMPLIB) $(GSL_LD_FLAGS) +rho_LDADD = -lprimesieve -lgsl $(GMPLIB) if WITH_GWNUM gwdata.ld : @@ -100,14 +104,16 @@ # Use ecm_DEPENDENCIES += gwdata.ld instead? Is that possible? 
ecm_DEPENDENCIES = gwdata.ld ecm_LDFLAGS = $(AM_LDFLAGS) -Wl,gwdata.ld + LIBS += -ldl Fgwtest : Fgw.c gwdata.ld $(CC) $(CFLAGS) $(CPPFLAGS) -g -DTESTDRIVE -Wl,gwdata.ld -o Fgwtest Fgw.c libecm.a $(LIBS) endif include_HEADERS = ecm.h noinst_HEADERS = basicdefs.h ecm-impl.h ecm-gmp.h ecm-ecm.h sp.h longlong.h \ - ecm-params.h mpmod.h ecm-gpu.h cudakernel.h addlaws.h \ - getprime_r.h ecm_int.h \ + ecm-params.h mpmod.h ecm-gpu.h torsions.h \ + cudakernel.h cudacommon.h cgbn_stage1.h \ + addlaws.h getprime_r.h ecm_int.h \ aprtcle/mpz_aprcl.h aprtcle/jacobi_sum.h EXTRA_DIST = test.pm1 test.pp1 test.ecm README.lib INSTALL-ecm ecm.xml \ @@ -116,8 +122,9 @@ powerpc64/params.h powerpc32/params.h sparc64/params.h \ hppa/params.h mips/params.h x86_64/corei7/params.h \ generic/params.h testlong.pp1 testlong.pm1 testlong.ecm \ - getprime_r.h cudakernel_default.cu test.gpuecm README.gpu c155 \ - test_dummy2.save test_dummy.save test_prime95.save M877.save \ + getprime_r.h cudakernel_default.cu cgbn_stage1.cu \ + test.gpuecm README.gpu c155 \ + test_dummy.save test_prime95.save M877.save \ M997.save test_Z2102.n DIST_SUBDIRS = athlon pentium4 x86_64 powerpc64 aprtcle build.vc12 @@ -125,8 +132,8 @@ DISTCLEANFILES = config.m4 if WANT_GPU -.cu.lo: cudakernel.h - $(LIBTOOL) --tag=CC --mode=compile $(NVCC) --compile $(NVCCFLAGS) $(CPPFLAGS) -o $@ $^ -static +.cu.lo: cudakernel.h cgbn_stage1.h + $(LIBTOOL) --tag=CC --mode=compile $(NVCC) --compile $(NVCCFLAGS) --compiler-options -fPIC $(CPPFLAGS) -o $@ $^ -static endif ecm-params: bench_mulredc$(EXEEXT) tune$(EXEEXT) @@ -141,6 +148,9 @@ if WANT_GPU dist_check_SCRIPTS += test.gpuecm endif +if WANT_CGBN +dist_check_SCRIPTS += test.cgbnecm +endif TESTS = $(dist_check_SCRIPTS) TESTS_ENVIRONMENT = $(VALGRIND) @@ -166,6 +176,12 @@ $(srcdir)/test.ecm "$(VALGRIND) ./ecm$(EXEEXT) -redc" $(srcdir)/test.ecm "$(VALGRIND) ./ecm$(EXEEXT) -mpzmod" $(srcdir)/test.ecm "$(VALGRIND) ./ecm$(EXEEXT) -treefile tree" +if WANT_GPU + $(srcdir)/test.gpuecm 
"$(VALGRIND) ./ecm$(EXEEXT)" +endif +if WANT_CGBN + $(srcdir)/test.cgbnecm "$(VALGRIND) ./ecm$(EXEEXT)" +endif $(srcdir)/testlong.pp1 "$(VALGRIND) ./ecm$(EXEEXT)" $(srcdir)/testlong.pm1 "$(VALGRIND) ./ecm$(EXEEXT)" $(srcdir)/testlong.ecm "$(VALGRIND) ./ecm$(EXEEXT)" diff -Nru gmp-ecm-7.0.4+ds/makesmooth.gp gmp-ecm-7.0.5+ds/makesmooth.gp --- gmp-ecm-7.0.4+ds/makesmooth.gp 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/makesmooth.gp 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,30 @@ +/* Simple PARI script to make test numbers for P-1 and P+1. For each prime q + in [B1, B2] it prints the n*p, where p is the smallest prime of the + form k*q+c. I.e. run with + echo "makesmooth(1000000, 1001000, 1, 1)" | gp -p 1001000 -q makesmooth | ecm -pm1 100 1001000 + and check that P-1 stage 2 finds all the input numbers as factors. + + To test P+1 properly, a suitable x0 must be generated. The makesmooth_x0() + function prints it along with the prime. The output can be used like + echo "makesmooth_x0(1000000, 1001000, -1, 1000000000000273)" | gp -p 1001000 -q makesmooth | while read N X0; do echo $N | ./ecm -pp1 -x0 $X0 1000 1001000; done + Alternatively, the function makesmooth_fixed_x0() can be used which outputs + only primes so that x0^2-4 is a quadratic non-residue, i.e. by + echo "makesmooth_fixed_x0(1000000, 1001000, -1, 3, 1000000000000273)" | gp -p 1001000 -q makesmooth | ecm -pm1 -x0 3 100 1001000 + will make sure that for all produced primes, 3^2-4 = 5 is a QNR and that + GMP-ECM with -x0 3 parameter will work properly. + + The parameter n can be used to test that the code finds actual factors, + not just the input number. + 1000000000000273 +- 1 has relatively large prime factors, making it useful + as a value for n. 
*/ + +/* Find the smallest prime of the form p = q*i+c */ +makesmooth_one (q, c) = {local(i); i = 1; while (!isprime(q*i+c), i++); return(q*i+c)} + +/* Find the smallest prime of the form p = q*i+c so that D is a QNR (mod p) */ +makesmooth_one_D (q, c, D) = {local(p); p=q+c; while (!isprime(p) || kronecker(D, p) != -1, p+=q); return(p)} + +find_x0 (p) = {local(i); i = 3; while (kronecker(i^2-4, p) == 1, i++); return(i);} +makesmooth(B1, B2, c, n) = {local(i, q); q = nextprime(B1); while (q <= B2, p = makesmooth_one (q, c); print(p * max(n,1)); q = nextprime(q + 1))} +makesmooth_x0(B1, B2, c, n) = {local(i, q); q = nextprime(B1); while (q <= B2, p = makesmooth_one (q, c); print(p * max(n,1), " ", find_x0(p)); q = nextprime(q + 1))} +makesmooth_fixed_x0(B1, B2, c, x0, n) = {local(i, q); q = nextprime(B1); while (q <= B2, p = makesmooth_one_D (q, c, x0^2-4); print(p * max(n,1)); q = nextprime(q + 1))} diff -Nru gmp-ecm-7.0.4+ds/mpmod.c gmp-ecm-7.0.5+ds/mpmod.c --- gmp-ecm-7.0.4+ds/mpmod.c 2016-09-06 07:39:34.000000000 +0000 +++ gmp-ecm-7.0.5+ds/mpmod.c 2022-06-06 14:16:49.000000000 +0000 @@ -762,6 +762,9 @@ break; } /* else go through */ +#if defined( __GNUC__ ) && __GNUC__ >= 7 && !defined(__ICC) + __attribute__ ((fallthrough)); +#endif case ECM_MOD_NOBASE2: if (mpz_size (N) < MPZMOD_THRESHOLD) repr = ECM_MOD_MODMULN; diff -Nru gmp-ecm-7.0.4+ds/mpzspm.c gmp-ecm-7.0.5+ds/mpzspm.c --- gmp-ecm-7.0.4+ds/mpzspm.c 2016-02-24 16:18:25.000000000 +0000 +++ gmp-ecm-7.0.5+ds/mpzspm.c 2022-06-06 14:16:49.000000000 +0000 @@ -24,7 +24,6 @@ #include "sp.h" #include "ecm-impl.h" - /* Tables for the maximum possible modulus (in bit size) for different transform lengths l. 
The modulus is limited by the condition that primes must be diff -Nru gmp-ecm-7.0.4+ds/mpzspv.c gmp-ecm-7.0.5+ds/mpzspv.c --- gmp-ecm-7.0.4+ds/mpzspv.c 2016-03-08 12:06:46.000000000 +0000 +++ gmp-ecm-7.0.5+ds/mpzspv.c 2022-06-06 14:16:49.000000000 +0000 @@ -259,7 +259,9 @@ } /* convert mpzvi to CRT representation, fast version, assumes - mpzspm->T has been precomputed (see mpzspm.c) */ + mpzspm->T has been precomputed (see mpzspm.c). + Warning: this function should be thread-safe, since it might be called + simultaneously by several threads. */ static void mpzspv_from_mpzv_fast (mpzspv_t x, const spv_size_t offset, mpz_t mpzvi, mpzspm_t mpzspm) @@ -267,36 +269,43 @@ const unsigned int sp_num = mpzspm->sp_num; unsigned int i, j, k, i0 = I0_THRESHOLD, I0; mpzv_t *T = mpzspm->T; + mpz_t *U; unsigned int d = mpzspm->d, ni; + U = malloc (sp_num * sizeof (mpz_t)); + for (j = 0; j < sp_num; j++) + mpz_init (U[j]); + ASSERT (d > i0); - /* T[0] serves as vector of temporary mpz_t's, since it contains the small - primes, which are also in mpzspm->spm[j]->sp */ /* initially we split mpzvi in two */ ni = 1 << (d - 1); - mpz_mod (T[0][0], mpzvi, T[d-1][0]); - mpz_mod (T[0][ni], mpzvi, T[d-1][1]); + mpz_mod (U[0], mpzvi, T[d-1][0]); + mpz_mod (U[ni], mpzvi, T[d-1][1]); for (i = d-1; i-- > i0;) { /* goes down from depth i+1 to i */ ni = 1 << i; for (j = k = 0; j + ni < sp_num; j += 2*ni, k += 2) { - mpz_mod (T[0][j+ni], T[0][j], T[i][k+1]); - mpz_mod (T[0][j], T[0][j], T[i][k]); + mpz_mod (U[j+ni], U[j], T[i][k+1]); + mpz_mod (U[j], U[j], T[i][k]); } - /* for the last entry T[0][j] if j < sp_num, there is nothing to do */ + /* for the last entry U[j] if j < sp_num, there is nothing to do */ } /* last steps */ I0 = 1 << i0; for (j = 0; j < sp_num; j += I0) { for (k = j; k < j + I0 && k < sp_num; k++) - x[k][offset] = mpn_mod_1 (PTR(T[0][j]), SIZ(T[0][j]), + x[k][offset] = mpn_mod_1 (PTR(U[j]), SIZ(U[j]), (mp_limb_t) mpzspm->spm[k]->sp); } /* The typecast to mp_limb_t assumes that 
mp_limb_t is at least as wide as sp_t */ + + for (j = 0; j < sp_num; j++) + mpz_clear (U[j]); + free(U); } #if defined(TRACE_mpzspv_from_mpzv) || defined(TRACE_ntt_sqr_reciprocal) diff -Nru gmp-ecm-7.0.4+ds/mul_fft-params.h.athlon64 gmp-ecm-7.0.5+ds/mul_fft-params.h.athlon64 --- gmp-ecm-7.0.4+ds/mul_fft-params.h.athlon64 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/mul_fft-params.h.athlon64 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,5 @@ +#define MUL_FFT_MODF_THRESHOLD 300 +#define SQR_FFT_MODF_THRESHOLD 568 +#define MUL_FFT_TABLE2 {{1, 4 /*66*/}, {401, 5 /*96*/}, {417, 4 /*98*/}, {433, 5 /*96*/}, {865, 6 /*96*/}, {897, 5 /*98*/}, {929, 6 /*96*/}, {2113, 7 /*97*/}, {2177, 6 /*98*/}, {2241, 7 /*97*/}, {2305, 6 /*98*/}, {2369, 7 /*97*/}, {3713, 8 /*93*/}, {3841, 7 /*98*/}, {4225, 8 /*94*/}, {4353, 7 /*98*/}, {4481, 8 /*94*/}, {4865, 7 /*98*/}, {4993, 8 /*95*/}, {6913, 9 /*87*/}, {7169, 8 /*96*/}, {7425, 9 /*93*/}, {7681, 8 /*96*/}, {8449, 9 /*94*/}, {8705, 8 /*97*/}, {8961, 9 /*90*/}, {9729, 8 /*97*/}, {9985, 9 /*90*/}, {11777, 8 /*97*/}, {12033, 9 /*92*/}, {13825, 10 /*87*/}, {14337, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {20993, 10 /*87*/}, {21505, 9 /*97*/}, {22017, 10 /*91*/}, {23553, 9 /*97*/}, {26113, 10 /*92*/}, {31745, 9 /*98*/}, {32257, 10 /*88*/}, {44033, 11 /*91*/}, {47105, 10 /*97*/}, {56321, 11 /*87*/}, {63489, 10 /*98*/}, {70657, 11 /*87*/}, {71681, 10 /*98*/}, {72705, 11 /*90*/}, {79873, 10 /*98*/}, {80897, 11 /*83*/}, {81921, 10 /*96*/}, {82945, 11 /*85*/}, {96257, 10 /*98*/}, {97281, 12 /*75*/}, {98305, 10 /*97*/}, {101377, 12 /*78*/}, {102401, 11 /*91*/}, {110593, 12 /*87*/}, {126977, 11 /*98*/}, {161793, 12 /*83*/}, {192513, 11 /*98*/}, {194561, 13 /*75*/}, {253953, 12 /*98*/}, {258049, 11 /*99*/}, {276481, 12 /*85*/}, {282625, 11 /*96*/}, {284673, 12 /*87*/}, {389121, 11 /*99*/}, {391169, 13 /*75*/}, {434177, 12 /*95*/}, {438273, 13 /*84*/}, {516097, 12 /*99*/}, {585729, 11 /*99*/}, {620545, 13 /*79*/}, {630785, 12 
/*96*/}, {651265, 13 /*83*/}, {778241, 12 /*99*/}, {782337, 11 /*99*/}, {817153, 12 /*96*/}, {819201, 14 /*79*/}, {1032193, 13 /*99*/}, {1040385, 11 /*99*/}, {1046529, 12 /*94*/}, {LONG_MAX, 0}} +#define MUL_FFTM_TABLE2 {{1, 4 /*66*/}, {337, 5 /*95*/}, {353, 4 /*97*/}, {369, 5 /*96*/}, {385, 4 /*98*/}, {401, 5 /*96*/}, {801, 6 /*96*/}, {833, 5 /*98*/}, {865, 6 /*96*/}, {1729, 7 /*96*/}, {1793, 6 /*98*/}, {1857, 7 /*96*/}, {2049, 6 /*98*/}, {2113, 7 /*97*/}, {3841, 8 /*96*/}, {4097, 7 /*98*/}, {4225, 8 /*97*/}, {4609, 7 /*98*/}, {4737, 8 /*97*/}, {7169, 9 /*93*/}, {7681, 8 /*98*/}, {8449, 9 /*94*/}, {8705, 8 /*98*/}, {8961, 9 /*94*/}, {9217, 8 /*98*/}, {9473, 9 /*95*/}, {14849, 10 /*93*/}, {15361, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {20481, 10 /*95*/}, {21505, 9 /*97*/}, {22017, 10 /*91*/}, {23553, 9 /*97*/}, {24065, 10 /*92*/}, {29697, 11 /*93*/}, {30721, 10 /*96*/}, {37889, 11 /*95*/}, {38913, 10 /*97*/}, {44033, 11 /*91*/}, {47105, 10 /*97*/}, {52225, 11 /*92*/}, {55297, 10 /*98*/}, {56321, 11 /*87*/}, {63489, 10 /*98*/}, {64513, 11 /*88*/}, {79873, 12 /*83*/}, {81921, 11 /*93*/}, {88065, 12 /*91*/}, {94209, 11 /*97*/}, {104449, 12 /*81*/}, {110593, 11 /*98*/}, {112641, 12 /*87*/}, {126977, 11 /*98*/}, {137217, 12 /*85*/}, {159745, 11 /*98*/}, {161793, 12 /*83*/}, {167937, 11 /*98*/}, {169985, 12 /*87*/}, {192513, 11 /*98*/}, {194561, 12 /*85*/}, {196609, 11 /*97*/}, {202753, 12 /*89*/}, {217089, 13 /*84*/}, {221185, 12 /*98*/}, {225281, 13 /*87*/}, {253953, 12 /*98*/}, {323585, 13 /*83*/}, {385025, 12 /*98*/}, {389121, 14 /*75*/}, {393217, 12 /*93*/}, {405505, 14 /*78*/}, {507905, 13 /*98*/}, {516097, 12 /*99*/}, {552961, 13 /*85*/}, {573441, 12 /*97*/}, {577537, 13 /*88*/}, {778241, 12 /*99*/}, {782337, 13 /*85*/}, {851969, 14 /*82*/}, {868353, 13 /*95*/}, {909313, 14 /*87*/}, {1032193, 13 /*99*/}, {LONG_MAX, 0}} +#define MUL_FFT_FULL_TABLE2 {{16, 1}, {4224, 2}, {4416, 6}, {4480, 2}, {4608, 4}, {4640, 2}, {4800, 1}, {5120, 2}, {5184, 1}, {5632, 
2}, {5760, 1}, {6656, 4}, {6720, 1}, {7168, 4}, {7360, 1}, {7936, 4}, {8000, 2}, {8064, 1}, {8704, 2}, {8832, 6}, {8960, 3}, {9216, 1}, {13312, 6}, {14336, 3}, {15360, 5}, {16896, 6}, {17920, 1}, {19968, 2}, {20736, 1}, {21504, 2}, {23808, 1}, {28672, 4}, {29440, 2}, {29952, 1}, {33792, 2}, {35328, 1}, {36864, 4}, {37120, 1}, {49152, 4}, {49920, 1}, {50176, 3}, {53248, 1}, {55296, 2}, {59904, 3}, {61440, 1}, {65536, 2}, {70656, 6}, {71680, 2}, {72192, 5}, {73728, 4}, {79360, 1}, {81920, 2}, {82944, 1}, {86016, 2}, {89088, 1}, {90112, 2}, {95232, 1}, {100352, 5}, {110592, 1}, {114688, 4}, {117760, 1}, {131072, 2}, {144384, 5}, {147456, 4}, {158720, 1}, {161792, 3}, {163840, 2}, {190464, 1}, {196608, 4}, {199680, 3}, {212992, 1}, {262144, 6}, {272384, 7}, {294912, 6}, {301056, 4}, {322560, 1}, {327680, 3}, {344064, 2}, {380928, 1}, {385024, 2}, {387072, 1}, {393216, 7}, {425984, 6}, {444416, 5}, {466944, 1}, {520192, 2}, {577536, 7}, {589824, 6}, {602112, 4}, {645120, 3}, {688128, 2}, {774144, 1}, {786432, 6}, {788480, 4}, {808960, 5}, {811008, 2}, {817152, 3}, {819200, 5}, {823296, 2}, {829440, 1}, {1048576, 2}, {1069056, 1}, {1073152, 5}, {1081344, 3}, {1089536, 2}, {LONG_MAX, 1}} diff -Nru gmp-ecm-7.0.4+ds/mul_fft-params.h.default gmp-ecm-7.0.5+ds/mul_fft-params.h.default --- gmp-ecm-7.0.4+ds/mul_fft-params.h.default 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/mul_fft-params.h.default 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,2 @@ +/* Empty file so that #include won't produce an error message. + With no parameters defined, mul_fft.c will use defaults. 
*/ diff -Nru gmp-ecm-7.0.4+ds/mul_fft-params.h.pentium3 gmp-ecm-7.0.5+ds/mul_fft-params.h.pentium3 --- gmp-ecm-7.0.4+ds/mul_fft-params.h.pentium3 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/mul_fft-params.h.pentium3 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,5 @@ +#define MUL_FFT_MODF_THRESHOLD 480 +#define SQR_FFT_MODF_THRESHOLD 480 +#define MUL_FFT_TABLE2 {{1, 4 /*66*/}, {305, 5 /*95*/}, {321, 4 /*97*/}, {337, 5 /*95*/}, {353, 4 /*97*/}, {369, 5 /*96*/}, {801, 6 /*96*/}, {1281, 7 /*91*/}, {1409, 6 /*97*/}, {1601, 7 /*92*/}, {1921, 6 /*98*/}, {1985, 7 /*94*/}, {2689, 8 /*91*/}, {2817, 7 /*95*/}, {3201, 8 /*92*/}, {3329, 7 /*96*/}, {3457, 8 /*87*/}, {3841, 7 /*96*/}, {3969, 8 /*88*/}, {4865, 7 /*97*/}, {4993, 8 /*90*/}, {6913, 9 /*87*/}, {7681, 8 /*96*/}, {8961, 9 /*90*/}, {9729, 8 /*97*/}, {9985, 9 /*83*/}, {11777, 8 /*97*/}, {12033, 9 /*85*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, {15873, 8 /*98*/}, {16129, 9 /*88*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {26113, 10 /*81*/}, {31745, 9 /*98*/}, {34305, 10 /*85*/}, {39937, 9 /*98*/}, {40449, 10 /*83*/}, {48129, 11 /*75*/}, {63489, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 /*75*/}, {126977, 11 /*98*/}, {129025, 9 /*98*/}, {130561, 11 /*80*/}, {194561, 12 /*75*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 9 /*99*/}, {278017, 10 /*94*/}, {293889, 9 /*99*/}, {294401, 7 /*99*/}, {294529, 8 /*99*/}, {294657, 10 /*94*/}, {310273, 9 /*99*/}, {310785, 10 /*95*/}, {326657, 12 /*83*/}, {389121, 13 /*75*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {662529, 11 /*96*/}, {686081, 10 /*99*/}, {687105, 9 /*99*/}, {687617, 11 /*95*/}, {718849, 10 /*99*/}, {752641, 9 /*99*/}, {753153, 11 /*95*/}, {784385, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, 
{884225, 11 /*96*/}, {980993, 10 /*99*/}, {982017, 12 /*93*/}, {LONG_MAX, 0}} +#define MUL_FFTM_TABLE2 {{1, 4 /*66*/}, {273, 5 /*94*/}, {289, 4 /*97*/}, {305, 5 /*95*/}, {609, 6 /*95*/}, {641, 5 /*97*/}, {673, 6 /*95*/}, {705, 5 /*97*/}, {737, 6 /*96*/}, {1473, 7 /*96*/}, {1537, 6 /*98*/}, {1601, 7 /*96*/}, {1665, 6 /*98*/}, {1729, 7 /*96*/}, {2689, 8 /*91*/}, {2817, 7 /*97*/}, {2945, 8 /*92*/}, {3329, 7 /*98*/}, {3457, 8 /*93*/}, {5377, 9 /*91*/}, {5633, 8 /*95*/}, {6401, 9 /*92*/}, {6657, 8 /*96*/}, {6913, 9 /*87*/}, {7681, 8 /*96*/}, {7937, 9 /*88*/}, {8705, 8 /*97*/}, {8961, 9 /*90*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {24065, 10 /*85*/}, {27649, 11 /*87*/}, {30721, 10 /*96*/}, {31745, 9 /*98*/}, {32257, 10 /*88*/}, {39937, 11 /*83*/}, {47105, 10 /*97*/}, {48129, 12 /*75*/}, {61441, 11 /*96*/}, {63489, 10 /*98*/}, {68609, 11 /*85*/}, {79873, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 /*75*/}, {126977, 11 /*98*/}, {161793, 12 /*83*/}, {192513, 13 /*75*/}, {253953, 12 /*98*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 12 /*85*/}, {323585, 10 /*99*/}, {326657, 9 /*99*/}, {327169, 10 /*95*/}, {330753, 12 /*84*/}, {389121, 10 /*99*/}, {392193, 9 /*99*/}, {392705, 10 /*96*/}, {408577, 9 /*99*/}, {409089, 8 /*99*/}, {409345, 10 /*96*/}, {412673, 12 /*90*/}, {454657, 13 /*87*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {555009, 10 /*99*/}, {556033, 9 /*99*/}, {556545, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {654337, 11 /*95*/}, {686081, 13 /*87*/}, {778241, 11 /*99*/}, {817153, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, {884225, 11 /*96*/}, {915457, 12 /*93*/}, {978945, 14 /*93*/}, {LONG_MAX, 0}} +#define MUL_FFT_FULL_TABLE2 {{100, 2}, {216, 1}, {256, 
2}, {264, 1}, {304, 2}, {312, 1}, {544, 4}, {560, 1}, {704, 2}, {720, 1}, {896, 2}, {960, 7}, {40960, 2}, {47616, 1}, {49152, 6}, {53760, 4}, {56320, 1}, {64512, 4}, {71680, 5}, {86016, 2}, {96768, 4}, {99840, 1}, {131072, 6}, {136192, 7}, {147456, 6}, {150528, 4}, {161280, 1}, {161792, 3}, {172032, 2}, {193536, 1}, {259072, 6}, {286720, 7}, {294912, 6}, {301056, 4}, {322560, 3}, {344064, 2}, {387072, 1}, {393216, 4}, {404480, 3}, {409600, 1}, {417792, 3}, {425984, 1}, {524288, 6}, {530432, 7}, {557056, 6}, {566272, 5}, {577536, 4}, {593920, 6}, {602112, 5}, {614400, 4}, {645120, 3}, {647168, 4}, {652800, 1}, {654336, 6}, {673792, 3}, {688128, 2}, {724992, 4}, {727040, 1}, {753664, 2}, {783360, 4}, {816640, 6}, {831488, 1}, {851968, 2}, {860160, 3}, {868352, 2}, {881664, 7}, {884736, 1}, {921600, 7}, {950272, 1}, {LONG_MAX, 1}} diff -Nru gmp-ecm-7.0.4+ds/mul_fft-params.h.pentium4 gmp-ecm-7.0.5+ds/mul_fft-params.h.pentium4 --- gmp-ecm-7.0.4+ds/mul_fft-params.h.pentium4 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/mul_fft-params.h.pentium4 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,5 @@ +#define MUL_FFT_MODF_THRESHOLD 480 +#define SQR_FFT_MODF_THRESHOLD 480 +#define MUL_FFT_TABLE2 {{1, 4 /*66*/}, {305, 5 /*95*/}, {321, 4 /*97*/}, {337, 5 /*95*/}, {353, 4 /*97*/}, {369, 5 /*96*/}, {801, 6 /*96*/}, {1281, 7 /*91*/}, {1409, 6 /*97*/}, {1601, 7 /*92*/}, {1921, 6 /*98*/}, {1985, 7 /*94*/}, {2689, 8 /*91*/}, {2817, 7 /*95*/}, {3201, 8 /*92*/}, {3329, 7 /*96*/}, {3457, 8 /*87*/}, {3841, 7 /*96*/}, {3969, 8 /*88*/}, {4865, 7 /*97*/}, {4993, 8 /*90*/}, {6913, 9 /*87*/}, {7681, 8 /*96*/}, {8961, 9 /*90*/}, {9729, 8 /*97*/}, {9985, 9 /*83*/}, {11777, 8 /*97*/}, {12033, 9 /*85*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, {15873, 8 /*98*/}, {16129, 9 /*88*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {26113, 10 /*81*/}, {31745, 9 /*98*/}, {34305, 10 /*85*/}, {39937, 9 /*98*/}, {40449, 10 /*83*/}, {48129, 11 /*75*/}, {63489, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 
/*75*/}, {126977, 11 /*98*/}, {129025, 9 /*98*/}, {130561, 11 /*80*/}, {194561, 12 /*75*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 9 /*99*/}, {278017, 10 /*94*/}, {293889, 9 /*99*/}, {294401, 7 /*99*/}, {294529, 8 /*99*/}, {294657, 10 /*94*/}, {310273, 9 /*99*/}, {310785, 10 /*95*/}, {326657, 12 /*83*/}, {389121, 13 /*75*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {662529, 11 /*96*/}, {686081, 10 /*99*/}, {687105, 9 /*99*/}, {687617, 11 /*95*/}, {718849, 10 /*99*/}, {752641, 9 /*99*/}, {753153, 11 /*95*/}, {784385, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, {884225, 11 /*96*/}, {980993, 10 /*99*/}, {982017, 12 /*93*/}, {LONG_MAX, 0}} +#define MUL_FFTM_TABLE2 {{1, 4 /*66*/}, {273, 5 /*94*/}, {289, 4 /*97*/}, {305, 5 /*95*/}, {609, 6 /*95*/}, {641, 5 /*97*/}, {673, 6 /*95*/}, {705, 5 /*97*/}, {737, 6 /*96*/}, {1473, 7 /*96*/}, {1537, 6 /*98*/}, {1601, 7 /*96*/}, {1665, 6 /*98*/}, {1729, 7 /*96*/}, {2689, 8 /*91*/}, {2817, 7 /*97*/}, {2945, 8 /*92*/}, {3329, 7 /*98*/}, {3457, 8 /*93*/}, {5377, 9 /*91*/}, {5633, 8 /*95*/}, {6401, 9 /*92*/}, {6657, 8 /*96*/}, {6913, 9 /*87*/}, {7681, 8 /*96*/}, {7937, 9 /*88*/}, {8705, 8 /*97*/}, {8961, 9 /*90*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {24065, 10 /*85*/}, {27649, 11 /*87*/}, {30721, 10 /*96*/}, {31745, 9 /*98*/}, {32257, 10 /*88*/}, {39937, 11 /*83*/}, {47105, 10 /*97*/}, {48129, 12 /*75*/}, {61441, 11 /*96*/}, {63489, 10 /*98*/}, {68609, 11 /*85*/}, {79873, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 /*75*/}, {126977, 11 /*98*/}, {161793, 12 /*83*/}, {192513, 13 /*75*/}, {253953, 12 /*98*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 12 /*85*/}, 
{323585, 10 /*99*/}, {326657, 9 /*99*/}, {327169, 10 /*95*/}, {330753, 12 /*84*/}, {389121, 10 /*99*/}, {392193, 9 /*99*/}, {392705, 10 /*96*/}, {408577, 9 /*99*/}, {409089, 8 /*99*/}, {409345, 10 /*96*/}, {412673, 12 /*90*/}, {454657, 13 /*87*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {555009, 10 /*99*/}, {556033, 9 /*99*/}, {556545, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {654337, 11 /*95*/}, {686081, 13 /*87*/}, {778241, 11 /*99*/}, {817153, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, {884225, 11 /*96*/}, {915457, 12 /*93*/}, {978945, 14 /*93*/}, {LONG_MAX, 0}} +#define MUL_FFT_FULL_TABLE2 {{100, 2}, {216, 1}, {256, 2}, {264, 1}, {304, 2}, {312, 1}, {544, 4}, {560, 1}, {704, 2}, {720, 1}, {896, 2}, {960, 7}, {40960, 2}, {47616, 1}, {49152, 6}, {53760, 4}, {56320, 1}, {64512, 4}, {71680, 5}, {86016, 2}, {96768, 4}, {99840, 1}, {131072, 6}, {136192, 7}, {147456, 6}, {150528, 4}, {161280, 1}, {161792, 3}, {172032, 2}, {193536, 1}, {259072, 6}, {286720, 7}, {294912, 6}, {301056, 4}, {322560, 3}, {344064, 2}, {387072, 1}, {393216, 4}, {404480, 3}, {409600, 1}, {417792, 3}, {425984, 1}, {524288, 6}, {530432, 7}, {557056, 6}, {566272, 5}, {577536, 4}, {593920, 6}, {602112, 5}, {614400, 4}, {645120, 3}, {647168, 4}, {652800, 1}, {654336, 6}, {673792, 3}, {688128, 2}, {724992, 4}, {727040, 1}, {753664, 2}, {783360, 4}, {816640, 6}, {831488, 1}, {851968, 2}, {860160, 3}, {868352, 2}, {881664, 7}, {884736, 1}, {921600, 7}, {950272, 1}, {LONG_MAX, 1}} diff -Nru gmp-ecm-7.0.4+ds/multiecm.c gmp-ecm-7.0.5+ds/multiecm.c --- gmp-ecm-7.0.4+ds/multiecm.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/multiecm.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,2511 @@ +/* multiecm.c - ECM with many curves with many torsion and/or in parallel + Author: F. 
Morain +*/ + +#include +#include +#include +#include + +#include /* GMP header file */ + +#include "ecm.h" /* ecm header file */ +#include "ecm-impl.h" +#include "ecm-ecm.h" +#include "mpmod.h" +#include "getprime_r.h" + +#ifdef HAVE_ADDLAWS +#include "addlaws.h" +#endif + +#ifdef HAVE_TORSION +#include "torsions.h" +#endif + +#define DEBUG_MULTI_EC 0 +#define MULTI_USE_ADD_SUB 1 + +#define NCURVE_MAX 2000 + +/* Morain/Olivos + OUTPUT: 1 if ok, 0 if pb (bad state reached). + SIDE EFFECT: fills in T[0..IT[ +*/ +int +MO_automaton(signed char *T, size_t *IT, mpz_t e, size_t le) +{ + size_t ie, iT = 0; + int state = 0, res = 1, bz; + + for(ie = 0; ie < le; ie++){ + bz = ecm_tstbit(e, ie) == 0; + switch(state){ + case 0: + if(bz) + T[iT++] = 0; + else + state = 1; + break; + case 1: + if(bz){ + T[iT++] = 1; + T[iT++] = 0; + state = 0; + } + else{ + T[iT++] = -1; + T[iT++] = 0; + state = 11; + } + break; + case 11: + if(bz) + state = 110; + else{ + T[iT++] = 0; + state = 11; + } + break; + case 110: + if(bz){ + T[iT++] = 1; + T[iT++] = 0; + state = 0; + } + else{ + T[iT++] = -1; + T[iT++] = 0; + state = 11; + } + } + } + if(state == 1 || state == 11) + T[iT++] = 1; + else if(state == 110) + res = 0; + *IT = iT; + return res; +} + +/* Do we have eval(T) == e? */ +int +MO_check(signed char *T, size_t iT, mpz_t e) +{ + mpz_t tmp; + int i, ok; + + printf("# Entering MO_check...\n"); + mpz_init_set_ui(tmp, 0); + for(i = ((int)iT)-1; i >= 0; i--){ + mpz_mul_2exp(tmp, tmp, 1); + mpz_add_si(tmp, tmp, (int)T[i]); + } +#if DEBUG_ADD_LAWS >= 2 + gmp_printf("e:=%Zd;\n", e); + gmp_printf("t:=%Zd;\n", tmp); +#endif + ok = mpz_cmp(tmp, e) == 0; + mpz_clear(tmp); + return ok; +} + +/* Do we have eval(T) == e? 
*/ +int +Split_check(short *S, size_t iS, mpz_t e) +{ + mpz_t tmp; + size_t i; + int ok; + + mpz_init_set_ui(tmp, 0); + for(i = 0; i < iS; i += 2){ + mpz_add_si(tmp, tmp, (int)S[i+1]); + mpz_mul_2exp(tmp, tmp, (int)S[i]); + } +#if DEBUG_ADD_LAWS >= 2 + gmp_printf("e:=%Zd;\n", e); + gmp_printf("t:=%Zd;\n", tmp); +#endif + ok = mpz_cmp(tmp, e) == 0; + mpz_clear(tmp); + return ok; +} + +/* Adapted from Koyama and Tsuroka, CRYPTO'92, using less space. + S is filled in left-to-right from T. +*/ +size_t +Split(short *S, size_t Slen, signed char *T, size_t iT, int w0) +{ + size_t i = iT-1, j, k, lW, iS = 0, w = (size_t)w0; + int gap; + short W, nomorei = 0; + + while(i >= w-1){ + /* next scan: T[i-w+1..i] */ + /* exclude right zeros */ + gap = 0; + for(j = i-w+1; j <= i; j++){ + if(T[j] != 0) + break; + gap++; + } + lW = i-j+1; + W = 0; + /* at this point, T[j] <> 0 */ + for(k = j; k <= i; k++) + W += T[k]*(((short)1<<(k-j))); +#if 0 + i = i-w; // new i >= -1 +#endif + if(i >= w){ + i = i-w; + /* exclude left zeros and update power of 2 */ +#if 0 + while((i >= 0) && (T[i] == 0)){ +#endif + while(T[i] == 0){ + gap++; + if(i == 0){ + nomorei = 1; + break; + } + i--; + } + } + else{ + // case i = w-1 + nomorei = 1; + } + S[iS] = gap; + S[iS+1] = W; + if(iS >= 2) + S[iS-2] += lW; + iS += 2; + if(iS > Slen) + return 0; + if(nomorei) + break; + } + /* at this point, we have to examine T[0..i] */ + if(nomorei == 0){ + /* exclude right zeros */ + gap = 0; + for(j = 0; j <= i; j++){ + if(T[j] != 0) + break; + gap++; + } + lW = i-j+1; + W = 0; + /* at this point, T[j] <> 0 */ + for(k = j; k <= i; k++) + W += T[k]*(((short)1) << (k-j)); + S[iS] = gap; + S[iS+1] = W; + if(iS >= 2) + S[iS-2] += lW; + iS += 2; + if(iS > Slen) + return 0; + } + return iS; +} + +/* + OUTPUT: iS such that S[0..iS[ was filled + -1 if Slen is too small + The Solinas version is too slow for big entries, since it requires too + many shifts. 
+ At the end of the process, we will have written + e = 2^t0 * (2*d0+1 + 2^t1 *(2*d1+1 + 2^t2 * (2*d2+1+... + 2^ts*(2*ds+1) ) + where ti >= w and -2^(w-1)+1 <= 2*di+1 < 2^(w-1)+1. + S will contain: [[ts, 2*ds+1], ..., [t1, 2*d1+1], [t0, 2*d0+1]]. +*/ +size_t +build_MO_chain(short *S, size_t Slen, mpz_t e, int w) +{ + /* first use automata */ + size_t le = mpz_sizeinbase(e, 2), iT = 0, iS = 0; +#if DEBUG_ADD_LAWS >= 0 + long tp = cputime(); + size_t i; +#endif + signed char *T = (signed char *)malloc((2*le) * sizeof(signed char)); /* humf */ + + MO_automaton(T, &iT, e, le); + if(iT > (2*le)){ + printf("#!# Error: iT too large %lu\n", iT); + } +#if DEBUG_ADD_LAWS >= 2 + /* check value of T */ + gmp_printf("# signed digits(%Zd):", e); + for(i = 0; i < (int)iT; i++) + printf(" %d", T[i]); + printf("\n"); +#endif +#if DEBUG_ADD_LAWS >= 2 + if(MO_check(T, iT, e) == 0) + printf("#!# Error in MO\n"); + else + printf("# good check in MO\n"); + printf("# le = %ld, iT = %ld, time = %ldms\n",le,iT,elltime(tp,cputime())); + tp = cputime(); +#endif + /* compact T to fill in S */ + iS = Split(S, Slen, T, iT, w); +#if DEBUG_ADD_LAWS >= 2 + printf("# time = %ldms\n", elltime(tp, cputime())); + printf("S ="); + for(i = 0; i < iS; i++) + printf(" %d", S[i]); + printf("\n"); +#endif +#if DEBUG_ADD_LAWS >= 2 + if(Split_check(S, iS, e) == 0){ + printf("#!# Error in Split\n"); + exit(-1); + } + else + printf("# good check in Split\n"); + +#endif + free(T); + return iS; +} + +size_t +build_add_sub_chain(short *S, size_t Slen, mpz_t e, int w) +{ + return build_MO_chain(S, Slen, e, w); +} + +#define EC_ADD_SUB_WMAX 10 +#define EC_ADD_SUB_2_WMAX (1 << EC_ADD_SUB_WMAX) + +/* TODO: do better */ +int +get_add_sub_w(mpz_t e) +{ + size_t l = mpz_sizeinbase(e, 2); + + if(l <= 16) + return 2; + else if(l <= 32) + return 3; + else if(l <= 128) + return 4; + else if(l <= 1024) + return 5; + else if(l <= 10240) + return 6; + else if(l <= 102400) + return 7; + else if(l <= 1024000) + return 7; + else + 
return 7; +} + +/* pack everybody */ +void +add_sub_pack(mpz_t s, int w, short *S, size_t iS) +{ + size_t nsh, cte = sizeof(mp_limb_t)/sizeof(short); + size_t cte2 = ((size_t) 1) << 16; + unsigned short *tmp; + + nsh = iS / cte; + if(iS % cte != 0) + nsh++; + nsh *= cte; + nsh += 4; + /* coding */ + tmp = (unsigned short *)malloc(nsh * sizeof(unsigned short)); + tmp[0] = w; + tmp[1] = iS / cte2; + tmp[2] = iS % cte2; + memcpy(tmp+4, S, iS * sizeof(unsigned short)); + s->_mp_d = (mp_limb_t *)tmp; /* humf */ +} + +void add_sub_unpack(int *w, short **S, size_t *iS, mpz_t s) +{ + unsigned short *T; + + T = (unsigned short *)s->_mp_d; /* humf */ + *w = (int)T[0]; + *iS = (size_t)((((size_t)T[1]) << 16) + (size_t)T[2]); + *S = (short *)(T+4); +#if DEBUG_ADD_LAWS >= 2 + printf("# iS_unpack = %lu\n", *iS); +#endif +} + +/* INPUT: S = [[ts, 2*ds+1], ..., [t1, 2*d1+1], [t0, 2*d0+1]] for + e = 2^t0 * (2*d0+1 + 2^t1 *(2*d1+1 + 2^t2 * (2*d2+1+... + 2^ts*(2*ds+1) ), + with -2^(w-1)+1 <= 2*di+1 < 2^{w-1}. 
+*/ +int +ell_point_mul_add_sub_with_S(mpz_t f, ell_point_t Q, ell_point_t P, + ell_curve_t E, mpmod_t n, int w, short *S, int iS) +{ + ell_point_t P0; + ell_point_t iP[EC_ADD_SUB_2_WMAX]; + size_t j; + int status = 1, i, k; +#if DEBUG_ADD_LAWS >= 2 + mpz_t ex; + unsigned long cpt = 0; + long tp; + ell_point_t QQ; + + tp = cputime(); + /* to reconstruct the exponent to check mults */ + mpz_init_set_ui(ex, 0); + ell_point_init(QQ, E, n); +#endif + + /* iP[i] <- (2*i+1) * P */ + k = (1 << (w-1)) - 1; + for(i = 0; i <= k; i++) + ell_point_init(iP[i], E, n); + ell_point_set(iP[0], P, E, n); +#if DEBUG_ADD_LAWS >= 2 + ell_point_check(iP[0], E, n); + gmp_printf("N:=%Zd;\n", n->orig_modulus); + printf("E:="); ell_curve_print(E, n); printf(";\n"); + printf("P:="); ell_point_print(P, E, n); printf(";\n"); +#endif + if(k > 0){ + /* P[k] <- [2]*P */ + if(ell_point_duplicate(f, iP[k], P, E, n) == 0){ + mpres_set(P0->x, iP[k]->x, n); + status = 0; + goto ell_point_mul_add_sub_end; + } +#if DEBUG_ADD_LAWS >= 2 + ell_point_check(iP[k], E, n); +#endif + for(i = 1; i <= k; i++){ + if(ell_point_add(f, iP[i], iP[i-1], iP[k], E, n) == 0){ + status = 0; + goto ell_point_mul_add_sub_end; + } +#if DEBUG_ADD_LAWS >= 2 + ell_point_check(iP[i], E, n); +#endif + } + /* at this point, P[i] = (2*i+1) P */ + } + + ell_point_init(P0, E, n); + ell_point_set_to_zero(P0, E, n); + +#if DEBUG_ADD_LAWS >= 2 + printf("P:="); ell_point_print(P, E, n); printf(";\n"); +#endif + /* S = [[ts, 2*ds+1], ... 
*/ + for(j = 0; j < iS; j += 2){ +#if DEBUG_ADD_LAWS >= 2 + printf("P0:="); ell_point_print(P0, E, n); printf(":\n"); +#endif + i = abs(S[j+1]) >> 1; /* (abs(S[j+1])-1)/2, S[j+1] is always odd */ + assert(i <= k); + if(S[j+1] > 0){ + if(ell_point_add(f, P0, P0, iP[i], E, n) == 0){ + status = 0; + break; + } +#if DEBUG_ADD_LAWS >= 2 + printf("iP%d:=", i); ell_point_print(iP[i], E, n); printf(":\n"); + printf("Radd:="); ell_point_print(P0, E, n); printf(":\n"); + printf("Q:=ProjEcmAdd(P0, iP%d, E, N): ", i); + printf("printf(\"CHK%lu: %%a\\n\", ProjEcmEqual(Q, Radd, N));\n", + cpt++); +#endif + } + else{ + /* add(-P) = sub(P) */ + if(ell_point_sub(f, P0, P0, iP[i], E, n) == 0){ + status = 0; + break; + } +#if DEBUG_ADD_LAWS >= 2 + printf("iP%d:=", i); ell_point_print(iP[i], E, n); printf(":\n"); + printf("Rsub:="); ell_point_print(P0, E, n); printf(":\n"); + printf("S:=ProjEcmNegate(iP%d, N):\n", i); + printf("Q:=ProjEcmAdd(P0, S, E, N): "); + printf("printf(\"CHK%lu: %%a\\n\", ProjEcmEqual(Q, Rsub, N));\n", + cpt++); +#endif + } +#if DEBUG_ADD_LAWS >= 2 + ell_point_check(P0, E, n); +#endif + /* now multiply */ + for(i = 0; i < S[j]; i++){ + if(ell_point_duplicate(f, P0, P0, E, n) == 0){ + status = 0; + break; + } +#if DEBUG_ADD_LAWS >= 2 + printf("Rdup:="); ell_point_print(P0, E, n); printf(":\n"); + printf("Q:=ProjEcmDouble(P0, E, N): "); + printf("printf(\"CHK%lu: %%a\\n\", ProjEcmEqual(Q, Rdup, N));\n", + cpt++); +#endif + } +#if DEBUG_ADD_LAWS >= 2 + mpz_add_si(ex, ex, (int)S[j+1]); + mpz_mul_2exp(ex, ex, (int)S[j]); +#endif +#if DEBUG_ADD_LAWS >= 2 + ell_point_check(P0, E, n); + ell_point_mul_plain(QQ, ex, P, E, n); + if(ell_point_equal(QQ, P0, E, n) == 0){ + gmp_printf("ex:=%Zd;\n", ex); + printf("P0:="); ell_point_print(P0, E, n); printf(";\n"); + printf("QQ:="); ell_point_print(QQ, E, n); printf(";\n"); + exit(-1); + } +#endif + if(status == 0) + break; + } + ell_point_mul_add_sub_end: +#if DEBUG_ADD_LAWS >= 2 + printf("# time[addsub%d] = %ldms\n", w, 
elltime(tp, cputime())); + printf("Checking with [ex]*P with %ld bits\n", (long)mpz_sizeinbase(ex,2)); + tp = cputime(); + ell_point_mul_plain(QQ, ex, P, E, n); + if(ell_point_equal(QQ, P0, E, n) == 0){ + gmp_printf("ex:=%Zd;\n", ex); + printf("P0:="); ell_point_print(P0, E, n); printf(";\n"); + printf("QQ:="); ell_point_print(QQ, E, n); printf(";\n"); + exit(-1); + } + printf("# time[plain] = %ldms\n", elltime(tp, cputime())); + mpz_clear(ex); +#endif + ell_point_set(Q, P0, E, n); + ell_point_clear(P0, E, n); + for(i = 0; i <= k; i++) + ell_point_clear(iP[i], E, n); + return status; +} + +/* multiply P=(x:y:z) by e and puts the result in Q. + Return value: 0 if a factor is found, and the factor is in Q->x, + 1 otherwise. + See Solinas 2000 for the most plug-and-play presentation. +*/ +int +ell_point_mul_add_sub(mpz_t f, ell_point_t Q, mpz_t e, ell_point_t P, + ell_curve_t E, mpmod_t n) +{ + size_t iS = 0, Slen, w; + int negated = 0, status = 1; +#if DEBUG_ADD_LAWS >= 2 + int j; +#endif + short *S; + + if(ell_point_is_zero(P, E, n)){ + ell_point_set(Q, P, E, n); + return 1; + } + + if(mpz_sgn(e) == 0){ + ell_point_set_to_zero(Q, E, n); + return 1; + } + + if(mpz_sgn (e) < 0){ + negated = 1; + mpz_neg(e, e); + ell_point_negate(P, E, n); + } + + if (mpz_cmp_ui(e, 1) == 0){ + ell_point_set(Q, P, E, n); + return 1; + } + + w = get_add_sub_w(e); + + Slen = 2 * mpz_sizeinbase(e, 2); +#if DEBUG_ADD_LAWS >= 2 + printf("# Slen=%lu\n", Slen); +#endif + S = (short *)malloc(Slen * sizeof(short)); + iS = build_add_sub_chain(S, Slen, e, w); + if(iS == 0){ + printf("build_NAF: Slen=%"PRIu64" too small\n", Slen); + return -1; + } +#if DEBUG_ADD_LAWS >= 2 + gmp_printf("addsub[%Zd=>%d]:", e, iS); + for(j = iS-1; j >= 0; j--) + printf(" %d", S[j]); + printf("\n"); + printf("P:="); ell_point_print(P, E, n); printf(";\n"); +#endif + status = ell_point_mul_add_sub_with_S(f, Q, P, E, n, w, S, iS); + free(S); +#if DEBUG_ADD_LAWS >= 2 + if(status == 0){ + printf("Not checking, since a 
factor was found!\n"); + } + else{ + ell_point_t PP; + mpz_t f; + int res; + + mpz_init(f); + ell_point_init(PP, E, n); + res = ell_point_mul_plain(PP, e, P, E, n); + if(res == 0){ + printf("Factor found during ell_point_mul_plain...!\n"); + } + else if(pt_w_cmp(Q->x, Q->y, Q->z, PP->x, PP->y, PP->z, n) != 1){ + mpz_gcd(f, PP->z, n->orig_modulus); + if(mpz_cmp_ui(f, 1) != 0){ + gmp_printf("non trivial gcd from plain: %Zd\n", f); + mpz_gcd(f, Q->z, n->orig_modulus); // FIXME: why? + gmp_printf("gcd from addsub: %Zd\n", f); + } + else{ + printf("PB\n"); + gmp_printf("N:=%Zd;\n", n->orig_modulus); + gmp_printf("e:=%Zd;\n", e); + printf("P:="); ell_point_print(P, E, n); printf(";\n"); + printf("x0:=P[1]/P[3] mod N; y0:=P[2]/P[3] mod N;\n"); + ell_curve_print(E, n); printf("E:=E mod N;\n"); + printf("addsub:="); ell_point_print(Q, E, n); printf(";\n"); + printf("plain:="); ell_point_print(PP, E, n); printf(";\n"); + exit(-1); + } + } + ell_point_clear(PP, E, n); + mpz_clear(f); + } +#endif + /* Undo negation to avoid changing the caller's e value */ + if (negated){ + ell_point_negate(P, E, n); + mpz_neg(e, e); + } + return status; +} + +int * +compute_forbidden_res(int disc) +{ + int *t = NULL; + + if(disc == 0) + return NULL; + if(disc == -3){ + /* we do not want p = 2 mod 3 */ + t = (int *)malloc(3 * sizeof(int)); + t[0] = 3; + t[1] = 2; + t[2] = -1; + } + else if(disc == -4){ + /* we do not want p = 3 mod 4 */ + t = (int *)malloc(3 * sizeof(int)); + t[0] = 4; + t[1] = 3; + t[2] = -1; + } + else if(disc == -8){ + /* (-2/p) = -1 <=> p = 5 or 7 mod 8 */ + t = (int *)malloc(4 * sizeof(int)); + t[0] = 8; + t[1] = 5; + t[2] = 7; + t[3] = -1; + } + else if(disc == -7 || disc == -11){ + /* (-d/p) = (p/d) when d is 3 mod 4 */ + int x, i, d = -disc; + + /* initialize */ + t = (int *)malloc(d * sizeof(int)); + memset(t, 0, d * sizeof(int)); + /* crude, but sufficient */ + for(x = 0; x < d; x++) + t[(x*x)%d] = 1; + /* x = 0 is always ok */ + t[0] = d; + i = 1; + for(x = 1; x < 
d; x++) + if(t[x] == 0) + t[i++] = x; + t[i++] = -1; +#if 0 + for(x = 0; x < i; x++) + printf(" %d", t[x]); + printf("\n"); +#endif + } + return t; +} + +/* We can probably hack so that s contains the coding of a NAF, containing + w, iS, S. +*/ +int +compute_s_4_add_sub(mpz_t s, ecm_uint B1, int disc) +{ + mpz_t t; + long tp; + short *S; + size_t Slen, iS; + int w, *forbiddenres = compute_forbidden_res(disc); + + mpz_init(t); + tp = cputime(); + compute_s(t, B1, forbiddenres); + free(forbiddenres); + printf("# computing prod(p^e <= %lu): %ldms\n", B1, elltime(tp,cputime())); +#if USE_ADD_SUB_CHAINS == 0 /* keeping it simple for the time being */ + mpz_set(s, t); +#else + tp = cputime(); + w = get_add_sub_w(t); + /* Slen = 2 * log_{2^w}(t) = 2*log_2(t)/w = 2 * 64 * size(t)/w */ + Slen = (2 * GMP_NUMB_BITS * mpz_size(t)) / w; + S = (short *)malloc(Slen * sizeof(short)); + iS = build_add_sub_chain(S, Slen, t, w); + printf("# NAF has %"PRIu64" terms (w=%d, Slen=%"PRIu64"): %ldms\n", iS, w, Slen, + elltime(tp,cputime())); + if(iS == 0){ + printf("build_NAF: Slen=%"PRIu64" too small\n", Slen); + return 0; + } + add_sub_pack(s, w, S, iS); + free(S); +#endif + mpz_clear(t); + return 1; +} + +/* fall back on traditional ECM. + TODO: use chkfile also. + */ +int +process_one_curve(mpz_t f, mpz_t N, double B1, mpz_t B2, + ecm_params params, ell_curve_t E, ell_point_t P) +{ + int ret; + + /* if B2 = ECM_DEFAULT_B2, compute it automatically from B1: + no freedom on B2! 
*/ + mpz_set(params->B2, B2); + /* will be set to B1 */ + mpz_set_si(params->B2min, ECM_DEFAULT_B2); + + mpz_set(params->x, P->x); + mpz_set(params->sigma, E->a4); /* humf */ + + if(E->type == ECM_EC_TYPE_MONTGOMERY) + params->sigma_is_A = 1; + else{ + params->sigma_is_A = -1; + mpz_set(params->y, P->y); + } + params->E = E; + + ret = ecm_factor(f, N, B1, params); + return ret; +} + +/* OUTPUT: ECM_PRIME_FAC_PRIME_COFAC if f prp, N/f prp + ECM_PRIME_FAC_COMP_COFAC if f prp, N/f composite + ECM_COMP_FAC_PRIME_COFAC if f composite, N/f prp + ECM_COMP_FAC_COMP_COFAC if f composite, N/f composite + */ +int +conclude_on_factor(mpz_t N, mpz_t f, int verbose) +{ + mpz_t C; + int factor_is_prime, cofactor_is_prime, ret; + + if(mpz_cmp(N, f) == 0){ + printf("# found input number\n"); + return ECM_INPUT_NUMBER_FOUND; + } + factor_is_prime = mpz_probab_prime_p (f, PROBAB_PRIME_TESTS); + mpz_init(C); + mpz_tdiv_q(C, N, f); + cofactor_is_prime = mpz_probab_prime_p (C, PROBAB_PRIME_TESTS); + if (factor_is_prime) + ret = cofactor_is_prime ? ECM_PRIME_FAC_PRIME_COFAC : + ECM_PRIME_FAC_COMP_COFAC; + else + ret = cofactor_is_prime ? ECM_COMP_FAC_PRIME_COFAC : + ECM_COMP_FAC_COMP_COFAC; + if (verbose >= 1) + { + printf ("Found %s factor of %u digits: ", + factor_is_prime ? "probable prime" : "composite", + nb_digits (f)); + mpz_out_str (stdout, 10, f); + printf ("\n"); + printf ("%s cofactor ", + cofactor_is_prime ? "Probable prime" : "Composite"); + mpz_out_str (stdout, 10, C); + printf (" has %u digits\n", nb_digits(C)); + } + mpz_clear(C); + return ret; +} + +#if DEBUG_MULTI_EC >= 2 +/* f is a (probable) prime factor of n. tP is in plain mod n form. 
*/ +void +dump_curves(ell_curve_t *tE, ell_point_t *tP, int nE, mpz_t f) +{ + int i; + + printf("CheckE:=procedure(E, D, P, info)\n"); + printf(" K:=QuadraticField(D); OK:=MaximalOrder(K);\n"); + printf(" printf \"#E[%%o]=%%o\\n\", info, Factorization(#E);\n"); + printf(" tw:=Twists(E); Et:=tw[2];\n"); + printf(" printf \"#Et[%%o]=%%o\\n\", info, Factorization(#Et);\n"); + printf(" gen:=Generators(E); printf \"ords=%%o\\n\", "); + printf("[Factorization(Order(g)):g in gen];\n"); + printf(" lf:=Factorization(Order(E!P)); printf \"ord(P)=%%o\\n\", lf;\n"); + printf(" for i:=1 to #lf do\n"); + printf(" lfi:=Factorization(lf[i][1]*OK);\n"); + printf(" ok,gen:=IsPrincipal(lfi[1][1]); print lf[i], ok, gen;\n"); + printf(" end for;\n"); + printf("end procedure;\n"); + gmp_printf("p:=%Zd; F:=GF(p); P:=[]; A:=[]; B:=[]; E:=[]; D:=[];\n", f); + for(i = 0; i < nE; i++){ + printf("D[%d]:=%d;\n", i+1, tE[i]->disc); + if(tE[i]->type == ECM_EC_TYPE_MONTGOMERY){ + mpmod_t fmod; + mpres_t x, y, A; + mpz_t tmp; + + mpz_init(tmp); + mpmod_init(fmod, f, ECM_MOD_DEFAULT); + mpres_init(x, fmod); + mpres_set_z(x, tP[i]->x, fmod); + mpres_init(y, fmod); + mpres_init(A, fmod); + mpres_set_z(A, tE[i]->a4, fmod); + if(montgomery_to_weierstrass(tmp, x, y, A, fmod) + == ECM_FACTOR_FOUND_STEP1){ + printf("GASP while dumping a Montgomery form curve!\n"); + } + printf("P[%d]:=[", i+1); print_mpz_from_mpres(x, fmod); + printf(", "); print_mpz_from_mpres(y,fmod); + printf(", 1];\n"); + printf("A[%d]:=", i+1); + print_mpz_from_mpres(A, fmod); + printf(";\n"); + printf("B[%d]:=(P[%d][2]^2-P[%d][1]^3-A*P[%d][1]) mod N;\n", + i+1, i+1, i+1, i+1); + mpres_clear(x, fmod); + mpres_clear(y, fmod); + mpres_clear(A, fmod); + mpmod_clear(fmod); + mpz_clear(tmp); + } + else if(tE[i]->type == ECM_EC_TYPE_WEIERSTRASS){ + gmp_printf("P[%d]:=[%Zd, %Zd, %Zd];\n", i+1, + tP[i]->x, tP[i]->y, tP[i]->z); + gmp_printf("A[%d]:=%Zd;\n", i+1, tE[i]->a4); + gmp_printf("B[%d]:=%Zd;\n", i+1, tE[i]->a6); + } + else{ + 
printf("Case %d NYI in dump_curves\n", tE[i]->type); + break; + } + printf("E[%d]:=EllipticCurve([F!A[%d], F!B[%d]]);\n", i+1, i+1, i+1); + printf("CheckE(E[%d], D[%d], P[%d], infos[%d]);\n",i+1,i+1,i+1,i+1); + } +} +#endif /* DEBUG_MULTI_EC >= 2 */ + +/* TODO: better control of B2 + dichotomy (cf. #B2) */ +int +one_curve_at_a_time(mpz_t f, char *ok, ell_curve_t *tE, ell_point_t *tP, int nE, + mpz_t N, ecm_params params, double B1, mpz_t B2, + char *savefilename) +{ + double tmpB1, tmpB2, B2g = 0, B2d = 0, dB2 = 0; /* 1e9; #B2 */ + int ret = 0, i, saveit, nhit, nhitmax = 1; /* #B2 */ + mpcandi_t candi; + char comment[256] = ""; + mpz_t C; + + mpcandi_t_init(&candi); + mpcandi_t_add_candidate(&candi, N, NULL, 0); + mpz_init(C); + /* process curves one at a time */ + for(i = 0; i < nE; i++){ + tmpB1 = B1; + tmpB2 = dB2; + nhit = 0; + while(1){ +#if DEBUG_MULTI_EC >= 2 + printf("infos:=[\"E%d\"];\n", i); + dump_curves(tE+i, tP+i, 1, N); +#endif + params->B1done = 1.0; +#if 0 /* #B2 */ + mpz_set_d(params->B2, tmpB2); +#endif + if(nhit > 0){ + tmpB2 = (B2d+B2g)/2; + printf("# trying new B2[%d]=%f\n", nhit, tmpB2); + } + ret = process_one_curve(f,N,tmpB1,B2,params,tE[i],tP[i]); + if(ret == ECM_NO_FACTOR_FOUND){ + if(nhit == 0) + /* no factor found in any step */ + break; + else{ + /* we are in some recursive step */ + printf("# B1done=%.0f\n", params->B1done); + if(params->B1done == tmpB1) + /* dichotomy for step 2 */ + B2g = tmpB2; + } + } + else if(ret == ECM_FACTOR_FOUND_STEP1){ + if(mpz_cmp(f, N) != 0) + /* non-trivial factor found */ + break; + else{ + tmpB1 = params->B1done - 1; + printf("# trying again with B1=%.0f\n", tmpB1); + } + } + else if(ret == ECM_FACTOR_FOUND_STEP2){ + if(mpz_cmp(f, N) != 0) + /* non-trivial factor found */ + break; + else{ + if(nhit == 0) + B2g = 0; + B2d = tmpB2; + } + } + else + break; + nhit++; + if(nhit == nhitmax) /* caution, Lemmy! 
*/ + break; + } + saveit = (savefilename != NULL); + if(ret > 0){ /* humf */ + ok[i] = 0; + ret = conclude_on_factor(N, f, params->verbose); + if(ret == ECM_INPUT_NUMBER_FOUND){ + printf("# B1done=%.0f\n", params->B1done); + printf("# proceeding to next curve\n"); + saveit = 0; + } + else{ +#if DEBUG_MULTI_EC >= 2 + if(ret == ECM_PRIME_FAC_PRIME_COFAC + || ret == ECM_PRIME_FAC_COMP_COFAC){ + /* output Magma lines to check #E's mod f */ + printf("infos:=[\"E%d\"];\n", i); + dump_curves(tE+i, tP+i, 1, f); + } +#endif + break; + } + } + else if(ret == ECM_ERROR){ + printf("Error for curve %d\n", i); + } + if(saveit){ + write_resumefile(savefilename, ECM_ECM, N, params, &candi, + tP[i]->x, tP[i]->y, comment); + } + + } +#if DEBUG_MULTI_EC >= 2 + printf("# let's debug all curves\n"); + dump_curves(tE, tP, nE, N); +#endif + mpz_clear (C); + mpcandi_t_free(&candi); + return ret; +} + +/********************************************************************** + Using parallelism. +**********************************************************************/ + +/********** group law on points **********/ + +int +pt_is_zero(ell_point_t P, ATTRIBUTE_UNUSED mpmod_t n) +{ + return mpz_sgn(P->z) == 0; +} + +void +pt_set_to_zero(ell_point_t P, mpmod_t n) +{ + mpz_set_ui(P->x, 0); + mpres_set_ui(P->y, 1, n); + mpz_set_ui(P->z, 0); +} + +void +pt_assign(ell_point_t Q, ell_point_t P, ATTRIBUTE_UNUSED mpmod_t n) +{ + mpres_set(Q->x, P->x, n); + mpres_set(Q->y, P->y, n); + mpres_set(Q->z, P->z, n); +} + +void +pt_neg(ell_point_t P, mpmod_t n) +{ + if(pt_is_zero(P, n) == 0) + mpres_neg(P->y, P->y, n); +} + +void +pt_many_set_to_zero(ell_point_t *tP, int nE, mpmod_t n) +{ + int i; + + for(i = 0; i < nE; i++) + pt_set_to_zero(tP[i], n); +} + +void +pt_many_neg(ell_point_t *tP, int nE, mpmod_t n) +{ + int i; + + for(i = 0; i < nE; i++) + pt_neg(tP[i], n); +} + +void +pt_many_assign(ell_point_t *tQ, ell_point_t *tP, int nE, mpmod_t n) +{ + int i; + + for(i = 0; i < nE; i++) + pt_assign(tQ[i], 
tP[i], n); +} + +void +pt_print(ell_curve_t E, ell_point_t P, mpmod_t n) +{ + printf("["); + print_mpz_from_mpres(P->x, n); + printf(", "); + print_mpz_from_mpres(P->y, n); + printf(", "); + if(E->type == ECM_EC_TYPE_WEIERSTRASS && E->law == ECM_LAW_AFFINE) + gmp_printf("%Zd", P->z); + else + print_mpz_from_mpres(P->z, n); + printf("]"); +} + +void +pt_many_print(ell_curve_t *tE, ell_point_t *tP, int nE, mpmod_t n) +{ + int i; + + for(i = 0; i < nE; i++){ + printf("%d: ", i); + pt_print(tE[i], tP[i], n); + printf(" on E.A="); + print_mpz_from_mpres(tE[i]->a4, n); + printf("\n"); + } +} + +/* Computes inv[i] = 1/x[i] using only one inversion, a la Montgomery. + If takeit[i] != 1, do not compute 1/x[i] (it is probably 0, or irrelevant). + We should have inv != x. + x[nx] is a buffer. + When a factor is found, the i s.t. x[i] is not invertible are looked for + and the corresponding values of takeit put to 2. +*/ +int +compute_all_inverses(mpz_t f, mpres_t *inv, mpres_t *x, int nx, mpmod_t n, char *takeit) +{ + int i; + +#if 0 + /* plain version, to debug the architecture */ + for(i = 0; i < nx; i++){ + if(takeit[i] != 1) + continue; + if(!mpres_invert(inv[i], x[i], n)){ + mpres_gcd(inv[0], x[i], n); // FIXME!! 
+#if DEBUG_ADD_LAWS >= 1 + printf("Factor[%d]: ", i); + mpz_out_str (stdout, 10, inv[0]); + printf ("\n"); +#endif + return 0; + } + } +#else + /* Montgomery's trick */ + for(i = 0; i < nx; i++){ + if(takeit[i] != 1){ + if(i == 0) + mpres_set_ui(inv[i], 1, n); + else + mpres_set(inv[i], inv[i-1], n); + } + else{ + if(i == 0) + mpres_set(inv[i], x[i], n); + else + mpres_mul(inv[i], inv[i-1], x[i], n); + } + } + /* invert */ + if(!mpres_invert(x[nx], inv[nx-1], n)){ + mpres_gcd(f, inv[nx-1], n); +#if DEBUG_ADD_LAWS >= 1 + printf("Factor[%d]: ", i); + mpz_out_str (stdout, 10, f); + printf ("\n"); +#endif + /* identifying the x[i]'s */ + for(i = 0; i < nx; i++){ + mpres_gcd(f, x[i], n); + if(mpz_cmp_ui(f, 1) != 0){ +#if DEBUG_ADD_LAWS >= 0 + printf("# x[%d] not invertible: ", i); + mpz_out_str (stdout, 10, f); + printf ("\n"); +#endif + /* ONE DAY: if x[nx] != inv[0], we have another factor! */ + takeit[i] = 2; + } + } + return 0; + } + /* get inverses back */ + /* say inv = 1/(x1*x2*x3) */ + for(i = nx-1; i > 0; i--) + if(takeit[i] == 1){ + mpres_mul(inv[i], x[nx], inv[i-1], n); /* 1/x3 = inv * (x1*x2) */ + mpres_mul(x[nx], x[nx], x[i], n); /* inv = 1/(x1*x2) */ + } + mpres_set(inv[0], x[nx], n); +#endif +#if DEBUG_ADD_LAWS >= 1 + /* printf("# checking inverses\n"); */ + mpres_t tmp; + mpres_init(tmp, n); + for(i = 0; i < nx; i++){ + mpres_mul(tmp, inv[i], x[i], n); + mpres_get_z(tmp, tmp, n); + if(mpz_cmp_ui(tmp, 1) != 0) + printf("ERROR in compute_all_inverses[%d]\n", i); + } + mpres_clear(tmp, n); +#endif + return 1; +} + +/* NOTE: we can have tR = tP or tQ. + In case a factor is found, it is put in num[nE]. 
+ */ +int +pt_many_common(mpz_t f, ell_point_t *tR, ell_point_t *tP, ell_point_t *tQ, + int nE, mpmod_t n, mpres_t *num, mpres_t *den, mpres_t *inv, + char *takeit) +{ + int i; + + if(compute_all_inverses(f, inv, den, nE, n, takeit) == 0){ + mpz_set(num[nE], inv[0]); + return 0; + } + for(i = 0; i < nE; i++){ + if(takeit[i] != 1) + continue; + /* l:=(inv[i]*num[i]) mod N; */ + mpres_mul(num[i], num[i], inv[i], n); + /* x:=(l^2-P[1]-Q[1]) mod N; */ + mpres_sqr(den[i], num[i], n); + mpres_sub(den[i], den[i], tP[i]->x, n); + mpres_sub(den[i], den[i], tQ[i]->x, n); + /* tR[i]:=[x, (l*(P[1]-x)-P[2]) mod N, 1]; */ + mpres_sub(tR[i]->x, tP[i]->x, den[i], n); + mpres_mul(tR[i]->x, tR[i]->x, num[i], n); + mpres_sub(tR[i]->y, tR[i]->x, tP[i]->y, n); + mpres_set(tR[i]->x, den[i], n); + } + return 1; +} + +/* In case a factor is found, it is put in num[nE]. */ +int +pt_many_duplicate(mpz_t f, ell_point_t *tQ, ell_point_t *tP, ell_curve_t *tE, + int nE, mpmod_t n, + mpres_t *num, mpres_t *den, mpres_t *inv, char *ok) +{ + char *takeit = (char *)malloc(nE * sizeof(char)); + int i, res; + + memcpy(takeit, ok, nE); + for(i = 0; i < nE; i++){ + if(ok[i] == 0) + continue; /* takeit[i] = 0 */ + if(pt_is_zero(tP[i], n)){ + takeit[i] = 0; + pt_set_to_zero(tQ[i], n); + } + else if(mpz_sgn(tP[i]->y) == 0){ + /* 2 * P[i] = O_E */ + takeit[i] = 0; + pt_set_to_zero(tP[i], n); + printf("# [2] * P[%d] = O_E\n", i); + } + else{ + mpres_sqr(num[i], tP[i]->x, n); + mpres_mul_ui(num[i], num[i], 3, n); + mpres_add(num[i], num[i], tE[i]->a4, n); + mpres_mul_ui(den[i], tP[i]->y, 2, n); + } + } + res = pt_many_common(f, tQ, tP, tP, nE, n, num, den, inv, takeit); + /* TODO: case takeit[i] == 2 */ + free(takeit); + return res; +} + +/* R[i] <- P[i] + Q[i], or a factor is found which is put in num[nE]. 
*/ +int +pt_many_add(mpz_t f, ell_point_t *tR, ell_point_t *tP, ell_point_t *tQ, ell_curve_t *tE, + int nE, mpmod_t n, + mpres_t *num, mpres_t *den, mpres_t *inv, char *ok) +{ + char *takeit = (char *)malloc(nE * sizeof(char)); + int i, res; + + memcpy(takeit, ok, nE); +#if DEBUG_ADD_LAWS >= 2 + printf("In pt_many_add, adding\n"); + pt_many_print(tE, tP, nE, n); + printf("and\n"); + pt_many_print(tE, tQ, nE, n); +#endif + for(i = 0; i < nE; i++){ + if(ok[i] == 0) + continue; /* takeit[i] = 0 */ + if(pt_is_zero(tP[i], n)){ +#if DEBUG_ADD_LAWS >= 2 + printf("# tEP[%d] = O_{E[%d]}\n", i, i); +#endif + takeit[i] = 0; + pt_assign(tR[i], tQ[i], n); + } + else if(pt_is_zero(tQ[i], n)){ +#if DEBUG_ADD_LAWS >= 2 + printf("# tEQ[%d] = O_{E[%d]}\n", i, i); +#endif + takeit[i] = 0; + pt_assign(tR[i], tP[i], n); + } + else if(pt_is_equal(tP[i], tQ[i])){ + /* we should double */ + if(mpz_sgn(tP[i]->y) == 0){ +#if DEBUG_ADD_LAWS >= 2 + printf("# 2 * P[%d] = O_{E[%d]}\n", i, i); +#endif + takeit[i] = 0; + pt_set_to_zero(tR[i], n); + } + else{ + /* ordinary doubling */ + mpres_sqr(num[i], tP[i]->x, n); + mpres_mul_ui(num[i], num[i], 3, n); + mpres_add(num[i], num[i], tE[i]->a4, n); + mpres_mul_ui(den[i], tP[i]->y, 2, n); + } + } + else if(mpres_equal(tQ[i]->x, tP[i]->x, n)){ + mpres_add(num[i], tQ[i]->x, tP[i]->x, n); + if(mpz_sgn(num[i]) == 0){ + takeit[i] = 0; + pt_set_to_zero(tR[i], n); + } + } + else{ + mpres_sub(num[i], tQ[i]->y, tP[i]->y, n); + mpres_sub(den[i], tQ[i]->x, tP[i]->x, n); + } + } + res = pt_many_common(f, tR, tP, tQ, nE, n, num, den, inv, takeit); + /* TODO: case takeit[i] == 2 */ + free(takeit); + return res; +} + +/* tER != tEP */ +static int +pt_many_sub(mpz_t f, ell_point_t *tR, ell_point_t *tQ, ell_point_t *tP, ell_curve_t *tE, + int nE, mpmod_t n, + mpres_t *num, mpres_t *den, mpres_t *inv, char *ok) +{ + int i, res; + + for(i = 0; i < nE; i++) + if(ok[i] == 1) + pt_neg(tP[i], n); + res = pt_many_add(f, tR, tQ, tP, tE, nE, n, num, den, inv, ok); + for(i = 
0; i < nE; i++) + if(ok[i] == 1) + pt_neg(tP[i], n); + return res; +} + +/* Ordinary binary left-right addition */ +static int +pt_many_mul_plain(mpz_t f, ell_point_t *tQ, ell_point_t *tP, ell_curve_t *tE, + int nE, mpz_t e, mpmod_t n, + mpres_t *num, mpres_t *den, mpres_t *inv, char *ok) +{ + size_t l = mpz_sizeinbase (e, 2) - 1; /* l >= 1 */ + int status = 1; + + pt_many_assign(tQ, tP, nE, n); + while (l-- > 0) + { + if(pt_many_duplicate (f, tQ, tQ, tE, nE, n, num, den, inv, ok) == 0) + { + status = 0; + break; + } +#if DEBUG_ADD_LAWS >= 2 + printf("Rdup:="); pt_many_print(tE, tQ, nE, n); printf(";\n"); +#endif + if (ecm_tstbit (e, l)) + { + if(pt_many_add (f, tQ, tP, tQ, tE, nE, n, num, den, inv, ok) == 0) + { + status = 0; + break; + } +#if DEBUG_ADD_LAWS >= 2 + printf("Radd:="); pt_many_print(tE, tQ, nE, n); printf(";\n"); +#endif + } + } + return status; +} + +/* Ordinary binary left-right addition; see Solinas00. Morally, we use + w = 2. */ +static int +pt_many_mul_add_sub_si(mpz_t f, ell_point_t *tQ, ell_point_t *tP, ell_curve_t *tE, int nE, + long c, mpmod_t n, + mpres_t *num, mpres_t *den, mpres_t *inv, char *ok) +{ + long u, S[64]; + int j, iS = 0, status = 1; + ATTRIBUTE_UNUSED int w = 2; + + /* build NAF_w(c) */ + while(c > 0){ + if((c & 1) == 1){ + /* c is odd */ + u = c & (long)3; + if(u == 3) + u = -1; + } + else + u = 0; + S[iS++] = u; + c >>= 1; + } + /* use it */ + pt_many_set_to_zero(tQ, nE, n); + for(j = iS-1; j >= 0; j--){ + if(pt_many_duplicate(f, tQ, tQ, tE, nE, n, num, den, inv, ok) == 0){ + status = 0; + break; + } +#if DEBUG_ADD_LAWS >= 2 + printf("Rdup:="); pt_many_print(tE, tQ, nE, n); printf(";\n"); +#endif + if(S[j] == 1){ + if(pt_many_add(f, tQ, tQ, tP, tE, nE, n, num, den, inv, ok) == 0){ + status = 0; + break; + } +#if DEBUG_ADD_LAWS >= 2 + printf("Radd:="); pt_many_print(tE, tQ, nE, n); printf(";\n"); +#endif + } + else if(S[j] == -1){ + if(pt_many_sub(f, tQ, tQ, tP, tE, nE, n, num, den, inv, ok) == 0){ + status = 0; + break; + } 
+#if DEBUG_ADD_LAWS >= 2 + printf("Rsub:="); pt_many_print(tE, tQ, nE, n); printf(";\n"); +#endif + } + } + return status; +} + +/* tEQ[i] <- e * tEP[i]; we must have tEQ != tEP */ +/* If a factor is found, it is put back in num[nE]. */ +int +pt_many_mul(mpz_t f, ell_point_t *tQ, ell_point_t *tP, ell_curve_t *tE, int nE, + mpz_t e, mpmod_t n, + mpres_t *num, mpres_t *den, mpres_t *inv, char *ok) +{ + size_t l; + int negated = 0, status = 1; + + if (mpz_sgn (e) == 0) + { + pt_many_set_to_zero(tQ, nE, n); + return 1; + } + + /* The negative of a point (x:y:z) is (x:-y:z) */ + if (mpz_sgn (e) < 0) + { + negated = 1; + mpz_neg (e, e); + pt_many_neg(tP, nE, n); + } + + if (mpz_cmp_ui (e, 1) == 0) + goto pt_many_mul_end; + + l = mpz_sizeinbase (e, 2) - 1; /* l >= 1 */ + if(l < 32) + status = pt_many_mul_add_sub_si(f, tQ, tP, tE, nE, mpz_get_si(e), n, + num, den, inv, ok); + else + status = pt_many_mul_plain(f, tQ, tP, tE, nE, e, n, num, den, inv, ok); + + +pt_many_mul_end: + + /* Undo negation to avoid changing the caller's e value */ + if (negated){ + mpz_neg (e, e); + pt_many_neg(tP, nE, n); + } + return status; +} + +/* Copied from classical ecm_stage1. 
*/ +int +all_curves_at_once(mpz_t f, char *ok, ell_curve_t *tE, ell_point_t *tP, int nE, + mpmod_t n, double B1, double *B1done, + int (*stop_asap)(void), char *chkfilename) +{ + ell_point_t tQ[NCURVE_MAX], tR[NCURVE_MAX]; + mpz_t num[NCURVE_MAX+1], den[NCURVE_MAX+1], inv[NCURVE_MAX], e; + double p = 0.0, r, last_chkpnt_p; + int ret = ECM_NO_FACTOR_FOUND; + long last_chkpnt_time; + int i; + prime_info_t prime_info; + + mpz_init(e); + for(i = 0; i < nE; i++){ + mpres_init(tQ[i]->x, n); mpres_set(tQ[i]->x, tP[i]->x, n); + mpres_init(tQ[i]->y, n); mpres_set(tQ[i]->y, tP[i]->y, n); + mpres_init(tQ[i]->z, n); mpres_set(tQ[i]->z, tP[i]->z, n); + + mpres_init(tR[i]->x, n); + mpres_init(tR[i]->y, n); + mpres_init(tR[i]->z, n); + + mpres_init(num[i], n); + mpres_init(den[i], n); + mpres_init(inv[i], n); + } + mpres_init(num[nE], n); /* to be used as buffer in compute_all_inverses */ + mpres_init(den[nE], n); /* to be used as buffer in compute_all_inverses */ + + last_chkpnt_time = cputime (); + +#if DEBUG_MULTI_EC >= 2 + printf("Initial points:\n"); + pt_many_print(tP, nE, n); +#endif + for (r = 2.0; r <= B1; r *= 2.0) + if (r > *B1done){ + if(pt_many_duplicate (tQ, tQ, tE, nE, n, num, den, inv, ok) == 0){ + mpz_set(f, num[nE]); + ret = ECM_FACTOR_FOUND_STEP1; + goto end_of_all; + } +#if DEBUG_MULTI_EC >= 2 + printf("P%ld:=", (long)r); pt_many_print(tQ, nE, n); printf(";\n"); +#endif + } + + last_chkpnt_p = 3.; + prime_info_init (prime_info); + for (p = getprime_mt (prime_info); p <= B1; p = getprime_mt (prime_info)){ + for (r = p; r <= B1; r *= p){ +#if DEBUG_MULTI_EC >= 2 + printf("## p = %ld at %ldms\n", (long)p, cputime()); +#endif + if (r > *B1done){ + mpz_set_ui(e, (ecm_uint) p); + if(pt_many_mul(tR, tQ, tE, nE, e, n, num, den, inv, ok) == 0){ + mpz_set(f, num[nE]); + ret = ECM_FACTOR_FOUND_STEP1; + goto end_of_all; + } +#if DEBUG_MULTI_EC >= 2 + pt_many_print(tR, nE, n); +#endif + for(i = 0; i < nE; i++) + if(pt_is_zero(tR[i], n)) + ok[i] = 0; + pt_many_assign(tQ, 
tR, nE, n); /* TODO: use pointers */ + } + if (stop_asap != NULL && (*stop_asap) ()){ + outputf (OUTPUT_NORMAL, "Interrupted at prime %.0f\n", p); + break; + } + + /* WARNING: not activated yet */ + if (chkfilename != NULL && p > last_chkpnt_p + 10000. && + elltime (last_chkpnt_time, cputime ()) > CHKPNT_PERIOD){ +#if 0 /* TODO: make this work for many curves */ + writechkfile (chkfilename, ECM_ECM, MAX(p, *B1done), n, A, x, y, z); +#endif + last_chkpnt_p = p; + last_chkpnt_time = cputime (); + } + } + } + end_of_all: + /* If stage 1 finished normally, p is the smallest prime > B1 here. + In that case, set to B1 */ + if (p > B1) + p = B1; + + if (p > *B1done) + *B1done = p; + +#if 0 + if (chkfilename != NULL) + writechkfile (chkfilename, ECM_ECM, *B1done, n, A, x, y, z); +#endif + prime_info_clear (prime_info); /* free the prime table */ + + /* put results back */ + pt_many_assign(tP, tQ, nE, n); + /* normalize all points */ + for(i = 0; i < nE; i++) + if(pt_is_zero(tP[i], n)) + pt_set_to_zero(tP[i], n); + /* clear temporary variables */ + mpz_clear(e); + for(i = 0; i < nE; i++){ + mpres_clear(tQ[i]->x, n); + mpres_clear(tQ[i]->y, n); + mpres_clear(tQ[i]->z, n); + mpres_clear(num[i], n); + mpres_clear(den[i], n); + mpres_clear(inv[i], n); + } + mpres_clear(num[nE], n); + mpres_clear(den[nE], n); + return ret; +} + +int +read_and_prepare(mpz_t f ATTRIBUTE_UNUSED, mpz_t x ATTRIBUTE_UNUSED, mpq_t q, + char *buf, mpz_t n ATTRIBUTE_UNUSED) +{ + mpq_set_str(q, buf, 10); +#ifdef HAVE_TORSION + if(mod_from_rat(x, q, n) == 0){ + mpz_set(f, x); + return 0; + } +#endif + return 1; +} + +/* + OUTPUT: ECM_NO_FACTOR_FOUND + ECM_INPUT_NUMBER_FOUND + ECM_PRIME_FAC_PRIME_COFAC + ECM_PRIME_FAC_COMP_COFAC + ECM_COMP_FAC_COMP_COFAC + ECM_COMP_FAC_PRIME_COFAC +*/ +int +process_many_curves(mpz_t f, mpmod_t n, double B1, mpz_t B2, + ell_curve_t *tE, ell_point_t *tP, int nE, + ecm_params params, int onebyone, char *savefilename) +{ + double B1done; + ell_point_t tQ[NCURVE_MAX]; + char *ok 
= (char *)malloc(nE * sizeof(char)); + int ret = 0, i; + long st = cputime (); + + memset(ok, 1, nE); + if(onebyone != 0){ + ret = one_curve_at_a_time(f, ok, tE, tP, nE, n->orig_modulus, params, + B1, B2, savefilename); + free(ok); + return ret; + } + /* take everybody */ + for(i = 0; i < nE; i++){ + ell_point_init(tQ[i], tE[i], n); + ell_point_set(tQ[i], tP[i], tE[i], n); + } + B1done = 1.0; + ret = all_curves_at_once(f, ok, tE, tQ, nE, n, B1, &B1done, NULL, NULL); + printf("# Step 1 took %ldms\n", elltime (st, cputime ())); + + if(ret != ECM_NO_FACTOR_FOUND){ + ret = conclude_on_factor(n->orig_modulus, f, params->verbose); +#if DEBUG_MULTI_EC >= 2 + if(ret == ECM_PRIME_FAC_PRIME_COFAC || ret == ECM_PRIME_FAC_COMP_COFAC) + /* output Magma lines to check properties of E mod f */ + dump_curves(tE, tP, nE, f); +#endif + } + else{ + params->sigma_is_A = -1; + params->B1done = B1; + for(i = 0; i < nE; i++){ + if(ok[i] == 0) + continue; +#if DEBUG_MULTI_EC >= 1 + printf("# Entering Step 2 for E[%d]\n", i); +#endif + mpres_get_z(tP[i]->x, tQ[i]->x, n); + mpres_get_z(tP[i]->y, tQ[i]->y, n); + mpres_get_z(tP[i]->z, tQ[i]->z, n); + ret = process_one_curve(f, n->orig_modulus, B1, B2, params, + tE[i], tP[i]); + if(ret != ECM_NO_FACTOR_FOUND){ + printf("## factor found in Step 2: "); + mpz_out_str (stdout, 10, f); + printf ("\n"); + ret = conclude_on_factor(n->orig_modulus, f, params->verbose); + break; + } + } + } + for(i = 0; i < nE; i++) + ell_point_clear(tQ[i], tE[i], n); + free(ok); + return ret; +} + +int +read_curves_from_file(int *nE, ell_curve_t *tE, ell_point_t *tP, + mpz_t *tf, int *nf, + mpmod_t n, char *fic_EP, int ncurves) +{ + FILE *ifile = fopen(fic_EP, "r"); + char bufA[1024], bufx[1024], bufy[1024], c, Etype; + mpq_t q; + int ret = ECM_NO_FACTOR_FOUND; + + *nE = 0; + mpq_init(q); + while(fscanf(ifile, "%s", bufA) != EOF){ + if(bufA[0] == '#'){ + /* skip line and print it */ + printf("%s", bufA); + while((c = getc(ifile)) != '\n') + printf("%c", c); + 
printf("\n"); + continue; + } + else + Etype = bufA[0]; + ell_curve_init(tE[*nE],ECM_EC_TYPE_WEIERSTRASS,ECM_LAW_HOMOGENEOUS,n); + if(Etype == 'W'){ + if(fscanf(ifile, "%s %s %s", bufA, bufx, bufy) == EOF) + break; + tE[*nE]->type = ECM_EC_TYPE_WEIERSTRASS; + tE[*nE]->law = ECM_LAW_AFFINE; + } + else if(Etype == 'H'){ + if(fscanf(ifile, "%s %s %s", bufA, bufx, bufy) == EOF) + break; + tE[*nE]->type = ECM_EC_TYPE_HESSIAN; + } + else if(Etype == 'M'){ + if(fscanf(ifile, "%s %s", bufA, bufx) == EOF) + break; + tE[*nE]->type = ECM_EC_TYPE_MONTGOMERY; + } + else{ + printf("Unknown curve type: %c\n", Etype); + return ECM_ERROR; + } + mpz_init(tE[*nE]->a4); + if(read_and_prepare(tf[*nf], tE[*nE]->a4, q, bufA, n->orig_modulus) == 0){ + ret = 0; + *nf += 1; + goto process_end; + } + ell_point_init(tP[*nE], tE[*nE], n); + mpz_init(tP[*nE]->x); + if(read_and_prepare(tf[*nf], tP[*nE]->x, q, bufx, n->orig_modulus) == 0){ + ret = 0; + *nf+= 1; + goto process_end; + } + mpz_init(tP[*nE]->y); + if((Etype == 'W') || (Etype == 'H')){ + if(read_and_prepare(tf[*nf], tP[*nE]->y, q, bufy, n->orig_modulus) == 0){ + ret = 0; + *nf+= 1; + goto process_end; + } + } + mpz_init_set_ui(tP[*nE]->z, 1); + *nE += 1; + if(ncurves != 0 && *nE == ncurves) + break; + } + process_end: + fclose(ifile); + mpq_clear(q); + return ret; +} + +/* + OUTPUT: ECM_NO_FACTOR_FOUND + ECM_PRIME_FAC_PRIME_COFAC + ECM_PRIME_FAC_COMP_COFAC + ECM_COMP_FAC_COMP_COFAC + ECM_COMP_FAC_PRIME_COFAC + One ring to run them all. 
+*/ +int +process_many_curves_loop(mpz_t tf[], int *nf, mpz_t n, double B1, mpz_t B2, + ecm_params params, + char *fic_EP, + char *torsion, int smin, int smax, int nE, + int disc, mpz_t *sqroots, + char *savefilename) +{ + ell_curve_t tE[NCURVE_MAX]; + ell_point_t tP[NCURVE_MAX]; + mpmod_t modulus; + int ret = 0, i, onebyone; + + onebyone = 1; /* mtyform; */ + while(1){ + /* cheating with the content of tE and tP that are first defined + over Z/nZ without residues + */ + mpmod_init(modulus, n, ECM_MOD_DEFAULT); + if(fic_EP != NULL) + ret = read_curves_from_file(&nE, tE, tP, tf, nf, modulus, + fic_EP, nE); +#ifdef HAVE_TORSION + else if(torsion != NULL) + ret = build_curves_with_torsion(tf[*nf],modulus,tE,tP, + torsion,smin,smax,nE,disc,sqroots); +#endif + else if(disc != 0){ +#if 0 + ret = build_curves_with_CM(tf[*nf],&nE,tE,tP,disc,modulus,sqroots); +#else + printf("Sorry, disabled right now!\n"); + exit(-1); +#endif + } + if(ret == ECM_NO_FACTOR_FOUND) + ret = process_many_curves(tf[*nf],modulus,B1,B2,tE,tP,nE,params, + onebyone,savefilename); + else{ + printf("Quid? 
%d\n", ret); + break; + } + /* clear curves */ + for(i = 0; i < nE; i++){ + ell_point_clear(tP[i], tE[i], modulus); + ell_curve_clear(tE[i], modulus); + } + mpmod_clear(modulus); + /* inspect result */ + if(ret == ECM_PRIME_FAC_PRIME_COFAC){ + *nf += 1; + break; + } + else if(ret == ECM_PRIME_FAC_COMP_COFAC){ + printf("# start again with n/f\n"); + mpz_tdiv_q(n, n, tf[*nf]); + *nf += 1; + } + else if(ret == ECM_COMP_FAC_PRIME_COFAC){ + mpz_t C; + + printf("# start again with f\n"); + mpz_init(C); + mpz_tdiv_q(C, n, tf[*nf]); + mpz_set(n, tf[*nf]); + mpz_set(tf[*nf], C); + mpz_clear(C); + *nf += 1; + } + else if(ret == ECM_COMP_FAC_COMP_COFAC){ + mpz_t f; + + mpz_init_set(f, tf[*nf]); + /* update n right now */ + mpz_tdiv_q(n, n, f); + gmp_printf("# recursive call for f=%Zd\n", f); + process_many_curves_loop(tf, nf, f, B1, B2, params, fic_EP, + torsion, smin, smax, nE, + disc, sqroots, savefilename); + /* there is always some cofactor to store */ + mpz_set(tf[*nf], f); + *nf += 1; + printf("# start again with n/f\n"); + } + else /* something happened */ + break; + } + return ret; +} + +/* Assume b^n = 1 mod N. + status = 0 if the squareroot could not be computed, + -1 if the check is bad, + 1 otherwise. + */ +int +odd_square_root_mod_N(mpz_t f, int *status, mpz_t *sqroots, + int b, int n, int q, mpz_t N) +{ + mpz_t zeta, tmp, tmp2; + int np = n, e = 0, *tab, x, ret = ECM_NO_FACTOR_FOUND; + + *status = 1; + while(np % q == 0){ + e++; + np /= q; + } + /*printf("# n = %d = %d^%d * %d\n", n, q, e, np);*/ + mpz_init_set_ui(zeta, b); + mpz_powm_ui(zeta, zeta, np, N); + if(mpz_cmp_ui(zeta, 1) == 0){ + printf("# missed: zeta == 1\n"); + *status = 0; + } + else{ + /* look for k s.t. 
zeta^{q^k} = 1 */ + mpz_init_set(tmp, zeta); + do{ + mpz_set(zeta, tmp); + mpz_powm_ui(tmp, tmp, q, N); + } while(mpz_cmp_ui(tmp, 1) != 0); + /* gmp_printf("# zeta_%d = %Zd\n", q, zeta);*/ + mpz_sub_si(f, zeta, 1); + mpz_gcd(f, f, N); + if(mpz_cmp_ui(f, 1) != 0){ + printf("# Factor found (gcd(zeta_%d-1, N)): ", q); + mpz_out_str(stdout, 10, f); + printf("\n"); + ret = ECM_FACTOR_FOUND_STEP1; + goto end_of_odd_sqrt; + } + /* compute eta0 = sum zeta^R */ + tab = (int *)malloc((q+1) * sizeof(int)); + memset(tab, 0, (q+1) * sizeof(int)); + for(x = 1; x < q; x++) + tab[(x*x) % q] = 1; + mpz_set_ui(tmp, 0); + mpz_init(tmp2); + for(x = 1; x < q; x++) + if(tab[x] == 1){ + mpz_powm_ui(tmp2, zeta, x, N); + mpz_add(tmp, tmp, tmp2); + } + mpz_add(tmp, tmp, tmp); + mpz_add_ui(tmp, tmp, 1); + mpz_mod(tmp, tmp, N); + mpz_mul(tmp2, tmp, tmp); + if(q % 4 == 1){ + gmp_printf("# sqrt(%d) = %Zd\n", q, tmp); + mpz_sub_si(tmp2, tmp2, q); + } + else{ + gmp_printf("# sqrt(-%d) = %Zd\n", q, tmp); + mpz_add_si(tmp2, tmp2, q); + } + mpz_mod(tmp2, tmp2, N); + if(mpz_sgn(tmp2) == 0) + mpz_init_set(sqroots[0], tmp); + else{ + gmp_printf("Bad check: %Zd\n", tmp2); + gmp_printf("N:=%Zd;\n", N); + *status = -1; + } + mpz_clear(tmp); + mpz_clear(tmp2); + free(tab); + } + end_of_odd_sqrt: + mpz_clear(zeta); + return ret; +} + +/* b^(2*k) = -1 => (b^k)^2 = -1 mod N */ +static void +psb_minus_even(int *tsq, mpz_t sqroots[], int b, int k, mpz_t N) +{ + int isq = 0; + + /* printf("# got sqrt(-1)\n");*/ + tsq[isq++] = -1; + mpz_init_set_si(sqroots[0], b); + mpz_powm_ui(sqroots[0], sqroots[0], k, N); + if(k % 2 == 0){ + /* zeta8 = b^(k/2) = (1+zeta4)/sqrt(2) + => sqrt(2) = (1+zeta4)/zeta8 */ + /* printf("# got sqrt(2)\n");*/ + mpz_init_set_si(sqroots[1], b); + mpz_powm_ui(sqroots[1], sqroots[1], k>>1, N); + mpz_invert(sqroots[1], sqroots[1], N); + mpz_add_si(sqroots[0], sqroots[0], 1); + mpz_mul(sqroots[1], sqroots[1], sqroots[0]); + mpz_mod(sqroots[1], sqroots[1], N); + mpz_sub_si(sqroots[0], sqroots[0], 
1); + tsq[isq++] = 2; + } + tsq[isq] = 0; +} + +/* b^(2*k+1) = -1 => (b^(k+1))^2 = -b mod N */ +static void +psb_minus_odd(int *tsq, mpz_t sqroots[], int b, int k, mpz_t N) +{ + /* printf("# got sqrt(-%d)\n", b);*/ + tsq[0] = -b; + mpz_init_set_si(sqroots[0], b); + mpz_powm_ui(sqroots[0], sqroots[0], k+1, N); +} + +/* b^(2*k+1) = 1 mod N => (b^(k+1))^2 = b mod N */ +static void +psb_plus_odd(int *tsq, mpz_t sqroots[], int b, int k, mpz_t N) +{ + /* printf("# got sqrt(%d)\n", b);*/ + mpz_init_set_si(sqroots[0], b); + mpz_powm_ui(sqroots[0], sqroots[0], k+1, N); + tsq[0] = b; +} + +/* N | b^n+c + OUTPUT: ECM_NO_FACTOR_FOUND or ECM_FACTOR_FOUND_STEP1 in very rare cases! +*/ +static int +prepare_squareroots_from_powers(mpz_t f, int *tsq, mpz_t sqroots[], + int b, int n, int c, mpz_t N) +{ + int k, ret = ECM_NO_FACTOR_FOUND; + mpz_t tmp, tmp2; + + tsq[0] = 0; + tsq[1] = 0; + if(c == -1){ + /* b^n = 1 mod N */ + if(n % 2 == 0){ + /* b^(2*k) = 1 => try to find smallest power */ + k = n >> 1; + while(k % 2 == 0) + k >>= 1; + mpz_init_set_si(tmp, b); + mpz_powm_ui(tmp, tmp, k, N); + mpz_init_set_si(tmp2, 0); + while(mpz_cmp_ui(tmp, 1) != 0){ + mpz_set(tmp2, tmp); + mpz_mul(tmp, tmp, tmp); + mpz_mod(tmp, tmp, N); + k <<= 1; + } + /* at this point, b^k == 1 */ + gmp_printf("# %d^%d = 1 mod %Zd;\n", b, k, N); + if(k % 2 == 1) + /* b^(2*r+1) = 1 mod N => (b^(r+1))^2 = b mod N */ + psb_plus_odd(tsq, sqroots, b, k>>1, N); + else{ + /* b^(2*r) = 1 */ + mpz_add_si(tmp2, tmp2, 1); + if(mpz_cmp(tmp2, N) == 0){ + /* case b^r = -1 */ + printf("# %d^%d = -1 mod N;\n", b, k>>1); + if(k % 4 == 0) + /* b^(2*s) = -1 */ + psb_minus_even(tsq, sqroots, b, k>>2, N); + else + /* b^(2*s+1) = -1 */ + psb_minus_odd(tsq, sqroots, b, k>>2, N); + } + else{ + /* we have a factor, since tmp2^2 = 1, tmp2 != -1, +1 */ + mpz_sub_si(tmp2, tmp2, 1); + mpz_gcd(f, tmp2, N); + gmp_printf("Factor!! 
%Zd\n", f); + return ECM_FACTOR_FOUND_STEP1; + } + } + mpz_clear(tmp); + mpz_clear(tmp2); + } + else + /* b^(2*k+1) = 1 mod N => (b^(k+1))^2 = b mod N */ + psb_plus_odd(tsq, sqroots, b, n>>1, N); + } + else if(c == 1){ + /* b^n = -1 mod N */ + if(n % 2 == 0) + /* b^(2*k) = -1 mod N => (b^k)^2 = -1 mod N */ + psb_minus_even(tsq, sqroots, b, n>>1, N); + else + /* b^(2*k+1) = -1 mod N => (b^(k+1))^2 = -b mod N */ + psb_minus_odd(tsq, sqroots, b, n>>1, N); + } + else{ + /* b^n = -c mod N */ + if(n % 2 == 0){ + /* (b^k)^2 = -c */ + tsq[0] = -c; /* FIXME: case c non squarefree? */ + mpz_init_set_si(sqroots[0], b); + mpz_powm_ui(sqroots[0], sqroots[0], n>>1, N); + } + else{ + /* (b^(k+1))^2 = -c*b */ + tsq[0] = -c*b; /* FIXME: case c non squarefree? */ + mpz_init_set_si(sqroots[0], b); + mpz_powm_ui(sqroots[0], sqroots[0], (n+1) >>1, N); + } + } + return ret; +} + +/* N is a cofactor of b^n+c. */ +static int +prepare_squareroots(mpz_t f, int *tsq, mpz_t sqroots[], + int b, int n, int c, mpz_t N) +{ + int ret, tabq[] = {3, 5, 7, 11, 13, 19, 0}, q, iq, qs, isq, nn, status; + + tsq[0] = 0; + ret = prepare_squareroots_from_powers(f,tsq,sqroots,b,n,c,N); + if(ret != ECM_NO_FACTOR_FOUND) + return ret; + printf("# I already found squareroots for:"); + for(isq = 0; tsq[isq] != 0; isq++) + printf(" %d", tsq[isq]); + printf("\n"); + /* let's find some squareroots using small odd prime divisors of n */ + if(abs(c) == 1){ + for(iq = 0; tabq[iq] != 0; iq++){ + q = tabq[iq]; + qs = (q % 4 == 1 ? q : -q); + if(n % q == 0){ + /* printf("# I can find sqrt(%d)\n", qs);*/ + /* make sure that b^nn = 1 */ + nn = (c == -1 ? 
n : 2*n); + ret = odd_square_root_mod_N(f,&status,sqroots+isq,b,nn,q,N); + if(ret != ECM_NO_FACTOR_FOUND) + break; + if(status == 1) + tsq[isq++] = qs; + } + } + tsq[isq] = 0; + } + return ret; +} + +static int +rebuild_squareroot(mpz_t sq2[], int *tsq, mpz_t sqroots[], int *tqs, mpz_t N) +{ + mpz_t tmp; + int isq, iqs, disc = tqs[0], ret, qs; + + mpz_set_ui(sq2[0], 1); + for(iqs = 1; tqs[iqs] != 0; iqs++){ + for(isq = 0; tsq[isq] != 0; isq++){ + if(tsq[isq] == -1) + qs = -4; + else if(tsq[isq] == -2) + qs = -8; + else + qs = tsq[isq]; + if(qs == tqs[iqs]){ + disc /= qs; + mpz_mul(sq2[0], sq2[0], sqroots[isq]); + mpz_mod(sq2[0],sq2[0], N); + break; + } + } + } + if(disc != 1){ + printf("#!# Pb: disc != 1: %d\n", disc); + return 0; + } + /* check */ + disc = tqs[0]; + if(disc % 4 == 0) disc >>= 2; + mpz_init_set(tmp, sq2[0]); + mpz_mul(tmp, tmp, sq2[0]); + mpz_sub_si(tmp, tmp, disc); + mpz_mod(tmp, tmp, N); + if(mpz_sgn(tmp) == 0){ + printf("# good check for sqrt(%d)\n", tqs[0]); + ret = 1; + } + else{ + printf("# bad check for sqrt(%d)\n", tqs[0]); + ret = 0; + } + mpz_clear(tmp); + return ret; +} + +/* Consider M = b^n+1 if n > 0, M = b^(-n)-1 otherwise. + N is supposed to be a *primitive* cofactor of M. + Then find a special cocktail of CM curves a` la Atkin. + To solve the B1 problem, only consider (b, n)'s s.t. disc(b, n) = discref. + + When torsion != NULL, this means we are using some curves over + Q(sqrt(discref)). 
+ + */ +int +process_special_blend(mpz_t tf[], int *nf, int *tried, + mpz_t N, int b, int n, int c, double B1, mpz_t B2, + ecm_params params, char *savefilename, + int discref, + char *torsion, int smin, int smax, int ncurves) +{ + int i; + int ret = ECM_NO_FACTOR_FOUND; + int tabd[][4] = {{-3, -3, 0, 0}, {-4, -4, 0, 0}, {-7, -7, 0, 0}, + {-8, -8, 0, 0}, {-11, -11, 0, 0}, + /* h = g = 2 */ + {-15, -3, 5, 0}, +#if 0 + {-20, -4, 5, 0}, {-24, 8, -3, 0}, + {-35, 5, -7, 0}, {-40, -8, 5, 0}, {-51, -3, 17, 0}, + {-52, -4, 13, 0}, {-88, 8, -11, 0}, + {-91, -7, 13, 0}, {-115, 5, -23, 0}, {-123, -3, 41, 0}, + {-148, 0, 0, 0}, {-187, 0, 0, 0}, + {-232, 0, 0, 0}, {-235, 0, 0, 0}, + {-267, 0, 0, 0}, {-403, 0, 0, 0}, {-427, 0, 0, 0}, + /* h = g = 4 */ + 84, 120, 132, 168, 195, 228, 280, 312, 340, 372, 408, + 435, 483, 520, 532, 555, 595, 627, 708, 715, 760, 795, + 1012, 1435, + /* h = g = 8 */ + 420, 660, 840, 1092, 1155, 1320, 1380, 1428, 1540, 1848, + 1995, 3003, 3315, + /* h = g = 16 */ + 5460 +#endif + {0, 0, 0, 0} + }; + mpz_t sqroots[10], sqd[10]; + int tsq[10], disc; + + tsq[0] = 0; + ret = prepare_squareroots(tf[0], tsq, sqroots, b, n, c, N); + if(ret != ECM_NO_FACTOR_FOUND) + return conclude_on_factor(N, tf[0], 1); + if(torsion != NULL){ + if(tsq[0] != discref) + printf("#W# tsq[0]=%d != discref\n", tsq[0]); + else{ + printf("# Using curves with torsion %s and disc=%d", + torsion, discref); + gmp_printf(" together with B1=%1.0f B2=%Zd\n", B1, B2); + *tried = 1; + ret = process_many_curves_loop(tf, nf, N, B1, B2, params, NULL, + torsion, smin, smax, ncurves, + discref, sqroots, savefilename); + /* TODO: improve this? 
*/ + } + return ret; + } + mpz_init_set_ui(sqd[0], 1); + for(i = 0; tabd[i][0] != 0; i++){ + disc = tabd[i][0]; + if(disc != discref || n % abs(disc) != 0) + continue; + /* rebuild sqrt(disc) */ + if(rebuild_squareroot(sqd, tsq, sqroots, tabd[i], N)){ + printf("# Using CM curves with disc=%d", disc); + gmp_printf(" together with B1=%1.0f B2=%Zd\n", B1, B2); + *tried = 1; + ret = process_many_curves_loop(tf, nf, N, B1, B2, params,NULL, + NULL, 0, 0, 1, + disc, sqd, savefilename); + if(ret != ECM_NO_FACTOR_FOUND) + break; + } + } + mpz_clear(sqd[0]); + for(i = 0; tsq[i] != 0; i++) + mpz_clear(sqroots[i]); + return ret; +} + +/* for N | b^n+c */ +static char * +best_M_d(int *disc, int b, int n, int c) +{ + *disc = 0; + if(c == -1) + /* b^(2*k+1) = 1 mod N => (b^(k+1))^2 = b mod N */ + *disc = b; + else if(c == 1 && (n % 2 == 1)) + /* b^(2*k+1) = -1 mod N => (b^(k+1))^2 = -b mod N */ + *disc = -b; + else{ /* FIXME: squarefree part of -c or -b*c */ + if(n % 2 == 0) + *disc = -c; + else + *disc = -b*c; + } +#ifdef HAVE_TORSION + /* TODO: case of b with a square prime factor */ + if(*disc != 0){ + int i, M = -1, Mi, di; + for(i = 0; strcmp(XM_data[i][0] , "0") != 0; i++){ + Mi = atoi(XM_data[i][0]); + di = atoi(XM_data[i][1]); + if(di == *disc) + M = Mi; + } + if(M != -1){ + char *tmp = (char *)malloc(4 * sizeof(char)); + sprintf(tmp, "Z%d", M); + return tmp; + } + } +#endif + return NULL; +} + +static void +usage(char *cmd) +{ + printf("Usage: %s -inp file_N -B1 B1 -B2 B2 -curves file_C", cmd); + printf(" -torsion T -smin smin -smax smax\n"); + printf(" -inp file_N numbers to be factored, one per line\n"); + printf(" file_N can be '-', in which case stdin is used\n"); + printf(" -curves file_C curves to be used, format '[M|W|H] A x0 y0' per line\n"); + printf(" M=Montgomery, W=Weierstrass, H=Hessian\n"); + printf(" -disc D uses CM curves with discriminant D\n"); + printf(" -b b for numbers b^n+/-1 (activates some special code; b=1 for any b in the file)\n"); + printf(" 
-format format where format = \"bn\" or \"plain\" (default)\n"); + printf(" -X1 select best X1(M) for b^n+/-1\n"); + printf(" -h, --help Prints this help and exit.\n"); +} + +#define NFMAX 100 + +int +main(int argc, char *argv[]) +{ + mpz_t N, tf[NFMAX], B2; + int res = 0, smin = -1, smax = -1, ncurves = 0, method = ECM_ECM, tried; + int nf = 0, i, bb = 0; + double B1 = 0.0, dB2 = 0.0; + int disc = 0, b = 0, n = 0, useX1 = 0, c; + char *infilename = NULL, *curvesname = NULL, *torsion = NULL; + char buf[10000], ch; + FILE *infile = NULL; + char *savefilename = NULL, format[20]; + ecm_params params; + int ret; + + /* print args */ + sprintf(format, "plain"); + printf("# ARGS: %s", argv[0]); + for(i = 1; i < argc; i++) + printf(" %s", argv[i]); + printf("\n"); + mpz_init_set_si(B2, ECM_DEFAULT_B2); + /* look for options */ + while ((argc > 1) && (argv[1][0] == '-')){ + if (strcmp (argv[1], "-h") == 0 || strcmp (argv[1], "--help") == 0){ + usage (argv[0]); + exit (EXIT_SUCCESS); + } + else if ((argc > 2) && (strcmp (argv[1], "-B1") == 0)){ + B1 = atof(argv[2]); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-B2") == 0)){ + dB2 = atof(argv[2]); + mpz_set_d(B2, dB2); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-inp") == 0)){ + infilename = argv[2]; + if(strcmp(infilename, "-") == 0) + infile = stdin; + else{ + infile = fopen (infilename, "r"); + if (!infile){ + fprintf (stderr, "Can't find input file %s\n", infilename); + exit (EXIT_FAILURE); + } + } + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-curves") == 0)){ + curvesname = argv[2]; + argv += 2; + argc -= 2; + } + /* one may restrict the number of curves used from file */ + else if ((argc > 2) && (strcmp (argv[1], "-ncurves") == 0)){ + ncurves = atoi(argv[2]); + argv += 2; + argc -= 2; + } + /** torsion related parameters **/ + else if ((argc > 2) && (strcmp (argv[1], "-torsion") == 0)){ + torsion = argv[2]; + argv += 2; + argc -= 2; 
+ } + else if ((argc > 2) && (strcmp (argv[1], "-smin") == 0)){ + smin = atoi(argv[2]); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-smax") == 0)){ + smax = atoi(argv[2]); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-disc") == 0)){ + disc = atoi(argv[2]); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-b") == 0)){ + b = atoi(argv[2]); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-format") == 0)){ + sprintf(format, "%s", argv[2]); + argv += 2; + argc -= 2; + } + else if ((argc > 2) && (strcmp (argv[1], "-save") == 0)){ + savefilename = argv[2]; + argv += 2; + argc -= 2; + } + else if (strcmp (argv[1], "-pm1") == 0){ + method = ECM_PM1; + argv++; + argc--; + } + else if (strcmp (argv[1], "-X1") == 0){ + useX1 = 1; + argv++; + argc--; + } + else if (strcmp (argv[1], "-pp1") == 0){ + method = ECM_PP1; + argv++; + argc--; + } + else{ + fprintf (stderr, "Unknown option: %s\n", argv[1]); + exit (EXIT_FAILURE); + } + } + if(infile == NULL){ + fprintf (stderr, "No input file given\n"); + exit (EXIT_FAILURE); + } + if(curvesname == NULL && torsion == NULL && disc == 0 && b == 0 + && useX1 == 0){ + fprintf (stderr, "Not enough parameters\n"); + exit (EXIT_FAILURE); + } + if(curvesname != NULL && torsion != NULL){ + fprintf (stderr, "Cannot have -curves and -torsion at the same time.\n"); + exit (EXIT_FAILURE); + } + if(torsion != NULL && ncurves == 0){ + fprintf (stderr, "You must provide ncurves != 0 with -torsion.\n"); + exit (EXIT_FAILURE); + } + if(ncurves > NCURVE_MAX){ + fprintf(stderr, "Too many curves: %d\n", ncurves); + exit (EXIT_FAILURE); + } + + if(torsion != NULL || useX1){ + if(disc == 0) + printf("# GMP-ECM [torsion=%s:%d-%d]\n", torsion, smin, smax); + else + printf("# GMP-ECM [torsion=%s:%d-%d;d=%d]\n", + torsion, smin, smax, disc); + } + else if(disc != 0){ + printf("# GMP-ECM [CM=%d]\n", disc); + ncurves = 1; /* FIXME */ + } + else if(useX1) + 
printf("# GMP-ECM [X1:%d-%d]\n", smin, smax); + + mpz_init (N); + for(i = 0; i < NFMAX; i++) + mpz_init(tf[i]); /* for potential factors */ + ecm_init(params); +#if DEBUG_MULTI_EC >= 2 + params->verbose = 2; +#else + /* params->verbose = OUTPUT_DEVVERBOSE;*/ + params->verbose = OUTPUT_NORMAL; +#endif +#if MULTI_USE_ADD_SUB + if(torsion != NULL || useX1) + compute_s_4_add_sub(params->batch_s, (ecm_uint)B1, 0); + else + compute_s_4_add_sub(params->batch_s, (ecm_uint)B1, disc); +#endif + while(fscanf(infile, "%s", buf) != EOF){ + /* read number */ + if(buf[0] == '#'){ + /* print till end of line */ + printf("%s", buf); + while((ch = getc(infile)) != '\n') + printf("%c", ch); + printf("\n"); + continue; + } + if(strcmp(format, "bn") == 0 || strcmp(format, "bnc") == 0){ + /* line should be: "b n[+/-/L/M] N" + or "b n[+/-]c N" + */ + bb = atoi(buf); + /* decode */ + ret = fscanf(infile, "%s", buf); + ASSERT_ALWAYS(ret != EOF); + if(strcmp(format, "bn") == 0){ + /* buf = "n[+/-/L/M]" */ + ch = buf[strlen(buf)-1]; + buf[strlen(buf)-1] = '\0'; + n = atoi(buf); + c = 1; + if(ch == '-') + c = -1; + else if(ch == 'L' || ch == 'M'){ + if(bb == 5) + c = -1; + } + else if(ch != '+'){ + printf("#!# unknown suffix: %c\n", ch); + break; + } +#if DEBUG_MULTI_EC >= 2 + printf("# I read: b=%d n=%d c=%d\n", bb, n, c); +#endif + } + else if(strcmp(format, "bnc") == 0){ + /* buf = "n[+/-]c" */ + sscanf(buf, "%d%c%d", &n, &ch, &c); + if(ch == '-') + c = -c; +#if DEBUG_MULTI_EC >= 2 + printf("# I read: b=%d n=%d c=%d\n", bb, n, c); +#endif + } + /* read N */ + ret = fscanf(infile, "%s", buf); + ASSERT_ALWAYS(ret != EOF); + if((b > 1) && (bb != b)) + continue; + if(useX1){ + /* select best (M, d) */ + torsion = best_M_d(&disc, bb, n, c); + if(torsion == NULL){ + printf("# no level found for this number, sorry\n"); + continue; + } + } + } + if(mpz_set_str (N, buf, 10)){ + fprintf (stderr, "Invalid number: %s\n", argv[1]); + exit (1); + } + res = ECM_NO_FACTOR_FOUND; + tried = 0; + if(method 
== ECM_ECM){ + nf = 0; + if((strcmp(format, "bn") == 0 || strcmp(format, "bnc") == 0) + && disc != 0){ + res = process_special_blend(tf,&nf,&tried,N,bb,n,c,B1,B2, + params,savefilename,disc, + torsion, smin, smax, ncurves); + } + if(res == ECM_NO_FACTOR_FOUND && !tried + && (curvesname != NULL || torsion != NULL || disc != 0)){ + res = process_many_curves_loop(tf, &nf, N, B1, B2, params, + curvesname, + torsion, smin, smax, ncurves, + disc, NULL, + savefilename); + } + } +#if DEBUG_MULTI_EC >= 2 + printf("List of factors:\n"); + for(i = 0; i < nf; i++) + gmp_printf("%Zd\n", tf[i]); +#endif + } + ecm_clear(params); + if(infile != stdin) + fclose(infile); + for(i = 0; i < NFMAX; i++) + mpz_clear(tf[i]); + mpz_clear(N); + mpz_clear(B2); + return res; +} diff -Nru gmp-ecm-7.0.4+ds/NEWS gmp-ecm-7.0.5+ds/NEWS --- gmp-ecm-7.0.4+ds/NEWS 2016-08-26 13:16:29.000000000 +0000 +++ gmp-ecm-7.0.5+ds/NEWS 2022-06-06 14:16:49.000000000 +0000 @@ -1,3 +1,13 @@ +Changes between GMP-ECM 7.0.4 and 7.0.5: +* fixed a bug when input is a file with several numbers: the -param choice + did depend on the first number +* fixed cross-compiling issue to mingw32 + (https://gitlab.inria.fr/zimmerma/ecm/-/issues/21836) +* fixed issue with Apple Xcode 12 + (https://gitlab.inria.fr/zimmerma/ecm/-/issues/21856) +* fixed OpenMP error when compiling with clang + (https://gitlab.inria.fr/zimmerma/ecm/-/issues/21857) + Changes between GMP-ECM 7.0.3 and 7.0.4: * fixed a bug in mpres_pow: https://gforge.inria.fr/tracker/?func=detail&atid=623&aid=20712&group_id=135 @@ -64,7 +74,7 @@ * GMP-ECM is now distributed under the GPL version 3 or later for the binary, and under the LGPL version 3 or later for the library * Fixed a speed regression with respect to GMP-ECM 6.3 - http://lists.gforge.inria.fr/pipermail/ecm-discuss/2012-February/004103.html + https://sympa.inria.fr/sympa/arc/ecm-discuss/2012-02/msg00000.html * Fixed a bug with the -treefile option which had been present for a long time * Several fixes for 
the Visual Studio 2010 build * New experimental option -batch=2, and speedup for -batch (i.e., -batch=1) diff -Nru gmp-ecm-7.0.4+ds/nodist/countsmooth.c gmp-ecm-7.0.5+ds/nodist/countsmooth.c --- gmp-ecm-7.0.4+ds/nodist/countsmooth.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/nodist/countsmooth.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,801 @@ +/* Enumerate smooth values from a range of numbers + (with Brent-Suyama extension) + + Copyright 2003 Alexander Kruppa. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; see the file COPYING. If not, write to the Free + Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + 02111-1307, USA. 
+*/ + +#include +#include +#include +#include +#include + +/* uncomment the following to use primegen, + cf http://cr.yp.to/primegen/primegen-0.97.tar.gz */ +/* #define PRIMEGEN */ + +#ifdef PRIMEGEN +#include +#else +#include "ecm-impl.h" /* for getprime() */ +#endif + +#define mulmod(r,u,v,n) mpz_mul(r,u,v);mpz_mod(r,r,n); + +#define DEBUG + +void dicksonmod(mpz_t, mpz_t, unsigned int, mpz_t, mpz_t); +int is_P_minus_i (mpz_t, unsigned int); +unsigned long eulerphi (unsigned long); +unsigned int get_lenF (unsigned int); +unsigned long gcd (unsigned long, unsigned long); +void quicksort (mpz_t *, unsigned int); +void quicksort_with_index (mpz_t *, unsigned int *, unsigned int); +int issorted (mpz_t *, unsigned int); +int getparm_ui (int, char **, int, char *, unsigned int *); +int getparm_d (int, char **, int, char *, double *); +int getparm_mpz (int, char **, int, char *, mpz_t *); +int brent_suyama_match (mpz_t, unsigned int, unsigned int, + unsigned int, unsigned int, mpz_t, mpz_t *, mpz_t *, + unsigned int *, unsigned int *); +double brent_suyama_theo (mpz_t, unsigned int, unsigned int, + unsigned int, unsigned int, mpz_t); +void print_help (void); + +/* Computes Dickson_{n, a}(x), the degree n Dickson polynomial with + parameter a, evaluated at point x, and returns value modulo p in r */ +void dicksonmod(mpz_t r, mpz_t x, unsigned int n, mpz_t a, + mpz_t p) +{ + unsigned int i, b = 0; + mpz_t t, u; + + mpz_init(t); + mpz_init(u); + + if (n == 0) + { + mpz_set_ui (r, 1); + return; + } + + if (n == 1) + { + mpz_set (r, x); + return; + } + + /* Now n >= 2 */ + while (n > 2 && (n & 1) == 0) + { + b++; + n >>= 1; + } + + mpz_mul (t, x, x); /* r = x^2 */ + mpz_sub (t, t, a); + mpz_sub (t, t, a); /* r = x^2 - 2*a = Dickson_{2,a}(x) */ + mpz_mod (r, t, p); + mpz_set (t, x); /* t = Dickson_{1,a}(x) */ + + for (i = 2; i < n; i++) + { + mulmod (u, t, a, p); /* u = a * Dickson_{i-1,a}(x) */ + mpz_set(t, r); /* t = Dickson_{i,a}(x) */ + mpz_mul (r, r, x); /* r = x * 
Dickson_{i,a}(x) */ + mpz_sub (r, r, u); /* r = x * Dickson_{i,a}(x) - a * Dickson_{i-1,a}(x) */ + mpz_mod (r, r, p); /* = Dickson_{i+1,a}(x) */ + } + + for ( ; b > 0; b--) + { + mulmod (t, r, r, p); /* t = (Dickson_{n,a}(x))^2 */ + mpz_powm_ui (u, a, n, p); + mpz_mul_2exp (u, u, 1); + mpz_sub (r, t, u); /* r = (Dickson_{n,a}(x))^2 - 2 * a^n */ + mpz_mod (r, r, p); /* = Dickson_{2*n,a}(x) */ + } + + mpz_clear (u); + mpz_clear (t); +} + + +/* Test if N+i is a probable prime */ + +int +is_P_minus_i (mpz_t N, unsigned int i) +{ + int r; + + mpz_add_ui (N, N, i); + r = mpz_probab_prime_p (N, 2); + mpz_sub_ui (N, N, i); + + return r; +} + + +/* Euler Phi(n) function, number of residues coprime to n */ +unsigned long +eulerphi (unsigned long n) +{ + unsigned long i, r=1; + + for (i=2; i*i<=n; i++) + if (n%i == 0) + { + r *= i-1; + n /= i; + while (n%i == 0) + { + r *= i; + n /= i; + } + } + + if (n>1) r *= n-1; + + return r; +} + +unsigned int +get_lenF (unsigned int D) +{ + return (eulerphi (D) / 2); +} + +unsigned long +gcd (unsigned long a, unsigned long b) +{ + unsigned long t; + + while (b != 0) { + t = a % b; + a = b; + b = t; + } + + return a; +} + +#define swap(a,b,t) {(t)=(a);(a)=(b);(b)=(t);} + +/* Sorts list of mpz_t values in a of length len. 
pivot is a temp var */ + +void +quicksort_with_index (mpz_t *data, unsigned int *index, unsigned int len) +{ + unsigned int i, j, t; + + if (len <= 1) return; + + if (len == 2) + { + if (mpz_cmp (data[0], data[1]) > 0) + { + mpz_swap (data[0], data[1]); + swap(index[0],index[1],t); + } + return; + } + + i = 0; j = len; + while (1) + { /* Top half gets everything greater than pivot */ + while (mpz_cmp(data[++i], data[0]) <= 0 && i < j); + while (mpz_cmp(data[--j], data[0]) > 0 && i < j); + if (i >= j) break; + + mpz_swap (data[i], data[j]); + swap(index[i],index[j],t); + } + + mpz_swap (data[0], data[i-1]); + swap(index[0],index[i-1],t); + + quicksort_with_index (data, index, i-1); + quicksort_with_index (data+i, index+i, len-i); +} + +#ifdef NO_INDEX + +/* Sorts list of mpz_t values in a of length len. pivot is a temp var */ + +void +quicksort (mpz_t *data, unsigned int len) +{ + unsigned int i, j; + + if (len <= 1) return; + + if (len == 2) + { + if (mpz_cmp (data[0], data[1]) > 0) + mpz_swap (data[0], data[1]); + return; + } + + i = 0; j = len; + while (1) + { /* Top half gets everything greater than pivot */ + while (mpz_cmp(data[++i], data[0]) <= 0 && i < j); + while (mpz_cmp(data[--j], data[0]) > 0 && i < j); + if (i >= j) break; + + mpz_swap (data[i], data[j]); + } + + mpz_swap (data[0], data[i-1]); + + quicksort (data, i-1); + quicksort (data+i, len-i); +} + +#endif + +int +issorted (mpz_t *a, unsigned int len) +{ + unsigned int i; + for (i = 0; i < len - 1; i++) + { + if (mpz_cmp (a[i], a[i+1]) > 0) + return 0; + } + + return 1; +} + +int +getparm_ui (int argc, char **argv, int i, char *parm, unsigned int *res) +{ + if (strcmp(argv[i], parm) == 0) { + i++; + if (i >= argc) { + printf("%s needs parameter\n", parm); + exit(EXIT_FAILURE); + } + *res = atoi (argv[i]); + return 1; + } + + return 0; +} + +int +getparm_d (int argc, char **argv, int i, char *parm, double *res) +{ + if (strcmp(argv[i], parm) == 0) { + i++; + if (i >= argc) { + printf("%s needs 
parameter\n", parm); + exit(EXIT_FAILURE); + } + *res = atof (argv[i]); + return 1; + } + + return 0; +} + +int +getparm_mpz (int argc, char **argv, int i, char *parm, mpz_t *res) +{ + if (strcmp(argv[i], parm) == 0) { + i++; + if (i >= argc) { + printf("%s needs parameter\n", parm); + exit(EXIT_FAILURE); + } + + if (index (argv[i], 'e')) + { /* Hmm, looks like scientific notation */ + double t; + t = atof (argv[i]); + mpz_set_d (*res, t); + mpz_add_ui (*res, *res, 1); /* Avoid getting a value with lots of 2 + factors because of mantissa truncation */ + } else + mpz_set_str (*res, argv[i], 10); + + return 1; + } + + return 0; +} + + +/* Searches for matches in the lists + F={f_S(i) % modulus : 0 0) + j++; + if (j < lenG && mpz_cmp (F[i], G[j]) == 0 && + (startG > 0 || i+j > 0)) /* Eliminate spurious factors in Dickson */ + { + unsigned int tj = j; + if (nrfactors == 0 && luckyF != NULL && luckyG != NULL) + { + *luckyF = indexF[i]; + *luckyG = indexG[j]; + } + while (tj < lenG && mpz_cmp (F[i], G[tj]) == 0) + { + nrfactors++; + tj++; + } + } + } + + if (F_param == NULL) + { + for (i = 0; i < lenF; i++) + mpz_clear (F[i]); + free (F); + } + + free (indexF); + + if (G_param == NULL) + { + for (i = 0; i < lenG; i++) + mpz_clear (G[i]); + free (G); + } + + free (indexG); + + mpz_clear (t); + + return nrfactors; +} + +/* Compute probability that value "modulus" is found by Brent-Suyama extension */ +double +brent_suyama_theo (mpz_t modulus, unsigned int S, unsigned int D, + unsigned int startG, unsigned int endG, mpz_t a) +{ + unsigned int mod_S, gcd_S; + double nr_points, p; + + if (endG < startG || D <= 1 || S == 0 || S & 1) + return 0.; + + /* See if the linear factors catch it */ + if (mpz_cmp_d (modulus, (double)D * (double)(startG)) > 0 && + mpz_cmp_d (modulus, (double)D * (double)(endG)) < 0) + return 1.0; + + mod_S = mpz_fdiv_ui (modulus, S); + if (mpz_sgn (a) == 0) + gcd_S = gcd (mod_S - 1, S) - 2; + else + gcd_S = (gcd (mod_S - 1, S) + gcd (mod_S + 1, S)) / 2 - 
2; + + nr_points = (double) get_lenF (D) * (double) (endG - startG + 1); + p = mpz_get_d (modulus); + + return 1.0 - pow(1.0 - (double)gcd_S / p, nr_points); +} + +void +print_help () +{ + printf ("countsmooth [-N ] [-tests ] [-blocks ] [-B1 ]\n" + " [-B2 ] [-S ] [-D ] [-startG ] [-endG ] [-a ]\n" + " [-maxBS ] [-v] [-theo] [-pm1] [-ecm]\n\n\n"); + + printf (" Determines which values in [N, N+nrtests-1] are smooth according to given\n" + " parameters.\n"); + printf (" A value is smooth if its factorization contains only primes and prime\n" + " powers <= B1 (B1-smooth), or if one prime factor is >B1 but <=B2 and all\n" + " others are B1-smooth, or if the largest prime factor can be expressed as\n" + " f_{S,a}(k*D) - f_{S,a}(i), where startG<=k<=endG and 0 Start of range of numbers to invesigate.\n"); + printf (" Length of range of numbers to investigate.\n"); + printf (" Split range into this many blocks to conserve memory.\n" + " By default = 1\n"); + printf (" B1-smoothness limit.\n"); + printf (" B2-smoothness limit.\n"); + printf (" Degree of Brent-Suyama polynomial in stage 2.\n"); + printf (" Stride for roots of G in stage 2.\n"); + printf (" Starting value for roots of G is * .\n" + " Default is floor( / )\n"); + printf (" Ending value for roots of G is * .\n" + " Default is floor( / ) + 1\n"); + printf (" Parameter for Dickson polynomial. If == 0, -th powers\n" + " are used. = 0 is the default.\n"); + printf (" Brent-Suyama extension is only tried on cofactors <= \n" + " Default is * 1000\n"); + printf (" -v Verbose. 
Print info on each value found smooth.\n"); + printf (" -theo Don't really compute and match Brent-Suyama values, just\n" + " calculate the probability of success and add up expected values.\n"); + printf (" -pm1 Investigate only those values in [N, N+nrtest-1] that are\n" + " a prime minus 1\n"); + printf( " -ecm Adjust for the ECM algorithm (divides N by 12, doubles S)\n"); +} + +int +main(int argc, char **argv) +{ + mpz_t N, a, *cofac; + unsigned int len_cofac; + unsigned int p, B1=0, i, D, S, startG , endG, Nmod12, blocklen; + unsigned int nr_tests = 0, nr_blocks = 1, blockstart = 0, nr_primes = 0; +#ifdef PRIMEGEN + primegen pg[1]; +#endif + double ppow, + B2=0., + maxBS = 0., /* Try Brent-Suyama only on cofactors <= maxBS */ + nrBSsmooth = 0.; /* Nr (or expected value) of Brent-Suyama successes */ + unsigned int nrB1smooth = 0, nrB2smooth = 0; + int verbose = 0, theo = 0, ecm_style = 0, pminus1 = 0; + mpz_t *F, *G; + + mpz_init (N); + mpz_init (a); + D = endG = S = 0; + startG = 0xffffffff; + B1 = 1000000; + B2 = 100000000; + nr_tests = 10000; + mpz_set_str (N, "8333333333333333333333333331", 10); + + /* Get parameters */ + + for (i = 1; i < (unsigned int) argc; i++) + { + if (strcmp (argv[i], "-v") == 0) + verbose = 1; + else if (strcmp (argv[i], "-theo") == 0) + theo = 1; + else if (strcmp (argv[i], "-ecm") == 0) + ecm_style = 1; + else if (strcmp (argv[i], "-pm1") == 0) + pminus1 = 1; + else if (getparm_ui (argc, argv, i, "-tests", &nr_tests)) + i++; + else if (getparm_ui (argc, argv, i, "-blocks", &nr_blocks)) + i++; + else if (getparm_ui (argc, argv, i, "-B1", &B1)) + i++; + else if (getparm_d (argc, argv, i, "-B2", &B2)) + i++; + else if (getparm_ui (argc, argv, i, "-D", &D)) + i++; + else if (getparm_ui (argc, argv, i, "-S", &S)) + i++; + else if (getparm_ui (argc, argv, i, "-startG", &startG)) + i++; + else if (getparm_ui (argc, argv, i, "-endG", &endG)) + i++; + else if (getparm_mpz (argc, argv, i, "-N", &N)) + i++; + else if (getparm_mpz (argc, 
argv, i, "-a", &a)) + i++; + else if (getparm_d (argc, argv, i, "-maxBS", &maxBS)) + i++; + else + { + printf ("Don't know parameter %s\n", argv[i]); + print_help(); + exit (EXIT_FAILURE); + } + } + + if (maxBS == 0.) + maxBS = B2 * 1000.; + + if (startG == 0xffffffff && D > 1) + startG = B1 / D; + + if (endG == 0 && D > 1) + { + if (B2 > (double)(~1U)*(double)D) + { + fprintf (stderr, + "Overflow error computing default endG. Please set explicitly\n"); + exit (EXIT_FAILURE); + } + endG = (unsigned int) (B2 / (double) D); + } + + if (ecm_style) + { + /* ECM is known to have a group order divisible by 12, which + effectively makes the part that has yet to be smooth N/12. + Instead of generating multiples of 12 and sieving those, + we simply divide N by 12 here. (This leads to rounding down if N + was not a multiple of 12, which we shouls correct when printing.) + + ECM finds a factor if p|f_S(x)-f_S(y) or p|f_S(x)+f_S(y), (because + a point and it's negative have the same x-coordinate) where p is + the missing group order factor, and f_S(x) and f_S(y) are the + polynomial values examined by the Brent-Suyma extension in stage 2. + If p is prime, this is equivalent to the condition + p|(f_S(x)-f_S(y))(f_S(x)+f_S(y)), and for both f_S(x)=x^S and + f_S(x) = S-th Dickson polynomials, + (f_S(x)-f_S(y))(f_S(x)+f_S(y)) = f_{2S}(x)-f_{2S}(y). + So effectively, ECM behaves as if S were twice as large, which we + account for by simply doubling S here. */ + + Nmod12 = mpz_fdiv_q_ui (N, N, 12); + S = S * 2; + } + + if (verbose) + { + printf ("B1=%u, B2=%.0f, %s%u, D=%u, %u<=G<=%u\nN=", + B1, B2, mpz_sgn(a) ? 
"Dickson_" : "X^", S, D, startG, endG); + mpz_out_str (stdout, 10, N); + printf ("\n"); + fflush (stdout); + } + + len_cofac = (nr_tests + nr_blocks - 1) / nr_blocks; + cofac = (mpz_t *) malloc (len_cofac * sizeof (mpz_t)); + for (i = 0; i < len_cofac; i++) + mpz_init (cofac[i]); + + F = (mpz_t *) malloc (get_lenF (D) * sizeof (mpz_t)); + for (i = 0; i < get_lenF (D); i++) + mpz_init (F[i]); + + G = (mpz_t *) malloc ((endG - startG + 1) * sizeof (mpz_t)); + for (i = 0; i < (endG - startG + 1); i++) + mpz_init (G[i]); + + while (nr_tests > 0) + { + if (nr_tests < len_cofac) + blocklen = nr_tests; + else + blocklen = len_cofac; + + /* Init cofac */ + for (i = 0; i < blocklen; i++) + mpz_add_ui (cofac[i], N, i); + + /* Do the sieving */ +#ifdef PRIMEGEN + primegen_init (pg); + for (p = primegen_next (pg); p <= B1; p = primegen_next (pg)) +#else + for (p = 2; p <= B1; p = getprime ()) +#endif + { + /* Compute first sieve location where p divides */ + i = mpz_fdiv_ui (N, p); + if (i > 0) + i = p - i; + + while (i < blocklen) + { + if (mpz_fdiv_q_ui (cofac[i], cofac[i], p) != 0) + { + fprintf (stderr, "%d does not divide cofac[%d]\n", p, i); + exit (EXIT_FAILURE); + } + /* Take out prime powers. Not very efficient. */ + ppow = (double)p * (double)p; + while (ppow <= B1 && mpz_fdiv_ui (cofac[i], p) == 0) + { + mpz_fdiv_q_ui (cofac[i], cofac[i], p); + ppow *= (double) p; + } + i += p; + } + } + + /* Check which cofactors are small enough and report */ + for (i = 0; i < blocklen; i++) + { + if (pminus1) /* Report only primes - 1 if pminus1 is set */ + { + /* Test if N+i+1 is prime. If it is, obviously no factors + can have been divided out of it, and the cofactor must + still be larger than N. */ + if ((i + 1 == blocklen || mpz_cmp (cofac[i + 1], N) > 0) && + is_P_minus_i (N, i+1)) + nr_primes++; + else + continue; + } + + if (mpz_cmp_ui (cofac[i], 1) == 0) + { + if (verbose) + printf ("N%s+%d: B1-smooth\n", ecm_style ? 
"/12" : "", i+blockstart); + + nrB1smooth++; + continue; + } + + if (mpz_cmp_d (cofac[i], B2) <= 0) + { + if (verbose) + { + printf ("N%s+%d: ", ecm_style ? "/12" : "", i + blockstart); + mpz_out_str (stdout, 10, cofac[i]); + printf ("\n"); + } + + nrB2smooth++; + continue; + } + + if (mpz_cmp_d (cofac[i], maxBS) <= 0) + { + if (theo) + { + double pr; + pr = brent_suyama_theo (cofac[i], S, D, startG, endG, a); + nrBSsmooth += pr; + if (verbose && pr >= 0.3) + { + printf ("N%s+%d: ", ecm_style ? "/12" : "", i + blockstart); + mpz_out_str (stdout, 10, cofac[i]); + printf (" (Brent-Suyama, pr=%f)\n", pr); + } + } else { + unsigned int luckyF, luckyG; + if (brent_suyama_match (cofac[i], S, D, startG, endG, a, F, G, &luckyF, &luckyG)) + { + unsigned int j; + + for (j = 2; j < S; j += 2) + if (S % j == 0 && + brent_suyama_match (cofac[i], j, D, startG, endG, a, F, G, &luckyF, &luckyG)) + break; + + if (verbose) + { + printf ("N%s+%d: ", ecm_style ? "/12" : "", i+blockstart); + mpz_out_str (stdout, 10, cofac[i]); + printf (" (Brent-Suyama, divides "); + if (mpz_sgn(a)) + printf( "Dickson_%u(D*%u)-Dickson_%u(%u))\n", j, luckyG, j, luckyF); + else + printf( "(D*%u)^%u-%u^%u)\n", luckyG, j, luckyF, j); + } + + nrBSsmooth++; + continue; + } + } + } + } + + nr_tests -= blocklen; + blockstart += blocklen; + mpz_add_ui (N, N, blocklen); +#ifndef PRIMEGEN + getprime_clear (); +#endif + } + + for (i = 0; i < get_lenF (D); i++) + mpz_clear (F[i]); + free (F); + + for (i = 0; i < (endG - startG + 1); i++) + mpz_clear (G[i]); + free (G); + + printf ("B1-smooth: %d, B2-smooth: %d, found by Brent-Suyama: %f, Total: %d\n", + nrB1smooth, nrB2smooth, nrBSsmooth ,nrB1smooth + nrB2smooth + + (unsigned int) nrBSsmooth); + + if (pminus1) + printf ("Number of N that are a prime - 1: %d\n", nr_primes); + + return 0; +} diff -Nru gmp-ecm-7.0.4+ds/nodist/rho.gp gmp-ecm-7.0.5+ds/nodist/rho.gp --- gmp-ecm-7.0.4+ds/nodist/rho.gp 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/nodist/rho.gp 
2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,386 @@ +/* + + Estimating the probability of success of the Elliptic Curve Method + + Copyright 2004, 2005 Alexander Kruppa. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; see the file COPYING. If not, write to the Free + Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. + + Version 0.1.1 + + History: 0.1 2004.09.29 + Initial release + 0.1.1 2005.06.14 + Changed extra smoothness of ECM from 12 to 23.4 + Started on pm1prob(), but incomplete yet + + How to use this file: + + 0) first start gp (http://pari.math.u-bordeaux.fr/) + + 1) load the file into gp: + + ? read ("rho.gp") + + 2) then call + + ? ecmprob(B1,B2,N,nr,S) + + where N is the approximate factor, nr is the number of points evaluated + in stage 2, i.e. nr = k*dF^2, S is the degree for Brent-Suyama. + Passing S>0 means S-th powers, -S means Dickson polynomials, S=0 means + no Brent-Suyama extension will be considered. + + The parameters can be obtained by running gmp-ecm with the -v + command line option. I.e. 
for B1=1000000 + + $ echo 65537 | ecm5 -v 1e6-1e6 + GMP-ECM 5.0.3 [powered by GMP 4.1.2] [ECM] + Input number is 65537 (5 digits) + Using special division for factor of 2^16+1 + Using B1=1000000, B2=839549780, polynomial Dickson(6), sigma=2157207190 + a=51553 + starting point: x=19319 + Step 1 took 0ms + x=19319 + B2'=948726240 k=5 b2=189560910 d=43890 dF=4320 + Initializing table of differences for F took 0ms + Found factor while computing F[49] + Step 2 took 20ms for 0 muls + ********** Factor found in step 2: 65537 + Found input number N + + So gmp-ecm internally uses B2=948726240, k=5, dF=4320 and + S=-6 (Dickson(6)). + + Passing these paramters to ecmprob, we get + + ? ecmprob(1000000,948726240,sqrt(10)*10^34,5*4320^2,-6) + %5 = 0.0009601784546838897811362317141 + ? 1./% + %6 = 1041.473066930272122308814268 + ? + + Thus, the expected number of curves to find a p35 factor with + gmp-ecm and B1=1000000 is approximately 1041. + +*/ + + +/* Returns \int_{1}^{x} 1 - rho(t-1)/t for 1 <= x <= 2. + This function is not actually used any more, rhoexact() needs a + differece L2(x)-L2(2) which can be simplified. */ +L2 (x) = +{ +/* \int (1 - Log(x-1))/x dx = + Log(x) + Log(x)*(Log(1-x) - Log(x-1)) + Dilog(x) + Dilog(x) = Pi^2/6 - Log(x)*Log(1-x) - Dilog(1-x) + thus: L2(x) = Log(x) - Log(x)*Log(x-1) + Pi^2/6 - Dilog(1-x) + L2(2) = Pi^2/4 + Log(2) */ + + return (log (x) * (1 - log (x-1)) + Pi ^ 2 / 6 - real (dilog (1 - x))) +} + +/* +L3(x) = +{ + \int L2(x-1)/x dx = + \int (1 - Log[x-1] + Log[x-1]*Log[x-2] + PolyLog[2,2-x] + Pi^2/12) / x + = Log[x] + + Log(x) * (Log(1-x) - Log(x-1)) + Dilog(x) + + (huge term, fearsome) + - PolyLog[3,2-x] + + Pi^2 / 12 * Log[x] +} +*/ + +rhoexact(x) = +{ + if (x <= 0., return (0.)); + if (x <= 1., return (1.)); + +/* 1 - \int_{1}^{x} rho(t-1)/t dt = + 1 - \int_{1}^{x} 1/t dt = + 1 - (log(x) - log(1)) = 1 - log(x) */ + + if (x <= 2., + return (1. 
- log(x))); + +/* For 2 <= x <= 3, + 1 - \int_{1}^{x} rho(t-1)/t dt = + 1 - \int_{1}^{2} 1/t dt - \int_{2}^{x} (1-log(t-1))/t dt = + 1 - log(2) - (L2(x) - L2(2)) + simplified, see L2() function. The real() is there because Pari returns a + complex result for dilog(x), x<~-0.7, with very small imaginary part, even + though the result should be purely real */ + + if (x <= 3., + return (1. - log (x) * (1. - log (x - 1.)) + real (dilog (1. - x)) + Pi ^ 2 / 12.)); + + error ("rhoexact: argument > 3"); +} + +/* With invh = 200, rho(8) = 0.000000032319, Knuth/Trapp-Pardo say ...21, + rho(9) = 0.000000001015, Knuth/Trapp-Pardo say ...16 + With invh = 400, all digits match Knuth/Trapp-Pardo (after rounding) */ + +tablemax = 10; +invh = 512; +h = 1. / invh; +rhotable = listcreate (tablemax * invh); +for (i = 1, 3 * invh - 1, listput (rhotable, rhoexact (i * h), i)) +/* FIXME: add listput (rhotable, rhoexact (3.), 3 * invh) here? */ +/* Boole's rule. The h conveniently cancel */ +for (i = 3 * invh, tablemax * invh, \ + listput (rhotable, rhotable[i - 4] - 2. / 45. * \ + ( \ + 7. * rhotable[i - invh - 4] / (i - 4.) \ + + 32. * rhotable[i - invh - 3] / (i - 3.) \ + + 12. * rhotable[i - invh - 2] / (i - 2.) \ + + 32. * rhotable[i - invh - 1] / (i - 1.) \ + + 7. * rhotable[i - invh] / i \ + ), i \ + ) \ +) + +/* The rho function as defined by Karl Dickman, or by Knuth/Trapp-Pardo (4.1)-(4.4), + for alpha < tablemax. For alpha >= tablemax, returns 0. */ +dickmanrho (alpha) = +{ + local (a, rho1, rho2); + if (alpha <= 0., return (0.)); + if (alpha <= 3., return (rhoexact (alpha))); + if (alpha < tablemax, + a = floor (alpha * invh); + rho1 = rhotable[a]; + rho2 = rhotable[a + 1]; + /* Linear interpolation. 
Should use a better model */ + return (rho1 + (rho2 - rho1) * (alpha * invh - a)); + ); + return (0.); +} + +/* The density of x^(1/alpha)-smooth positive integers below x with first + correction term, (4.8), (4.15) */ +dickmanrhosigma (alpha, x) = +{ + local(r); + if (alpha <= 0., return (0.)); + if (alpha <= 1., return (1.)); + if (alpha < tablemax, + return (dickmanrho (alpha) + + (1. - Euler) * dickmanrho (alpha - 1.) / log (x)); + ); + return (0.); +} + +/* Same, but ai is an index to rhotable, i.e. ai*h = alpha */ +dickmanrhosigma_i (ai, x) = +{ + if (ai <= 0, return (0.)); + if (ai <= invh, return (1.)); + if (ai <= tablemax * invh, + return (rhotable[ai] + (1. - Euler) * rhotable[ai-invh] / log (x)) + ); + return (0.); +} + +/* The density of x^(1/alpha)-smooth integers around x */ +dickmanlocal (alpha, x) = +{ + if (alpha <= 0., return (0.)); + if (alpha <= 1., return (1.)); + /* Avoid case where alpha >= tablemax, but alpha - 1 < tablemax which + would give negative result */ + if (alpha <= tablemax, + return (dickmanrho (alpha) - Euler * dickmanrho (alpha - 1.) / log (x)); + ); + return (0); +} + +dickmanlocal_i (ai, x) = +{ + if (ai <= 0, return (0.)); + if (ai <= invh, return (1.)); + if (ai <= 2 * invh, +/* dickmanrhosigma_i(ai, x) - dickmanrhosigma_i(ai-invh, x)/log(x), simplified */ + return (rhotable[ai] - Euler / log (x)) + ); + if (ai <= tablemax * invh, + return ( + rhotable[ai] - (Euler * rhotable[ai - invh] + + (1. - Euler) * rhotable[ai - 2 * invh] / log (x)) / log (x) + ) + ); + return (0); +} + +/* Probability that a number around x has all prime factors <= B1, + and exactly one prime p with B1 < p <=B2. 
+ Does a sum over primes p rather than an integral, which is + more accurate for small B2 */ +dickmanmu_noint (B1, B2, x) = +{ + local (p, s); + s = 0.; + /* print ("dickmanmu_noint ", B1, ", ", B2, ", ", x); */ + forprime(p = B1 + 1, B2, + s += dickmanlocal (log(x/p) / log(B1), x / p) / p; + ); + return(s); +} + +/* Probability that a number around x has all prime factors <=x^(1/alpha), + and exactly one >x^(1/alpha), <=x^(beta/alpha) */ +dickmanmu (alpha, beta, x) = +{ + local (a, ai, b, bi); + ai = ceil ((alpha - beta) * invh); + a = ai * h; + bi = floor ((alpha - 1.) * invh); + b = bi * h; + return ( + h * ( +/* Trapezoidal rule. Could be improved */ + sum (i = ai, bi, dickmanlocal_i (i, x) / (alpha - i * h)) + - (dickmanlocal_i (ai, x) / (alpha - a) + dickmanlocal_i (bi, x) / (alpha - b)) / 2. + ) + + (a - alpha + beta) * (dickmanlocal_i (ai, x) / (alpha - a) + dickmanlocal (alpha - beta, x) / beta) / 2. + + (alpha - 1. - b) * (dickmanlocal (alpha - 1., x) + dickmanlocal_i (bi, x) / (alpha - b)) / 2. + ); +} + +brentsuyama (B1, B2, N, nr) = +{ + local (a, ai, i, alpha, beta); + alpha = log (N) / log (B1); + beta = log (B2) / log (B1); + ai = floor ((alpha - beta) * invh); + a = ai * h; + return ( + h * ( + sum (i = 1, ai - 1, + dickmanlocal_i (i, N) / (alpha - i * h) * (1 - exp(-nr * B1 ^ (-alpha + i * h))) + ) +/* Between 0 and h, rho() is 1 everywhere except at 0, so we take it as 1 */ + + (1 - exp(-nr / B1 ^ alpha)) / 2. + + dickmanlocal_i (ai, N) / (alpha - a) * (1 - exp(-nr * B1 ^ (-alpha + a))) / 2. + ) + + (alpha - beta - a) * ( + dickmanlocal_i (ai, N) / (alpha - a) + dickmanlocal (alpha - beta, N) / beta + ) / 2. + ); +} + +/* Probability that the difference of two degree S Dickson polynomials, each + evaluated at nr random but distinct points, includes a prime p > B2 so + that N / p is B1-smooth. The linear factors of Dickson_S(a)-Dickson_S(b) + are assumed < B2 ! 
*/ +brsudickson (B1, B2, N, nr, S) = +{ + local (i, n, phi); + n = 0; + phi = eulerphi (S) / 2; + for (i = 1, S / 2, + if (gcd (i, S) == 1, +/* redundancy could be avoided by counting how often each gcd(i,S) value occurs */ + n = n + brentsuyama (B1, B2, N, nr * (gcd (i - 1, S) + gcd (i + 1, S) - 4) / 2) + ) + ); + return (n / phi); +} + +/* Same, but for S-th power */ +brsupower (B1, B2, N, nr, S) = +{ + local (i, n, phi); + n = 0; + phi = eulerphi (S); + for (i = 1, S, + if (gcd (i, S) == 1, + n = n + brentsuyama (B1, B2, N, nr * (gcd (i - 1, S) - 2)) + ) + ); + return (n / phi); +} + +gen_prob (B1, B2, N, nr, S, delta) = +{ + local (alpha, beta, stage1, stage2, brsu, Nadj); + Nadj = N / exp(delta); + alpha = log (Nadj) / log (B1); + beta = log (B2) / log (B1); + stage1 = dickmanlocal (alpha, Nadj); + stage2 = 0; + if (B2 > B1, + if (B2 < 20000, + stage2 = dickmanmu_noint (B1, B2, Nadj); + , + stage2 = dickmanmu (alpha, beta, Nadj); + ); + ); + brsu = 0; + if (S < -1, + brsu = brsudickson (B1, B2, Nadj, nr, -S * 2) + ); + if (S > 1, + brsu = brsupower (B1, B2, Nadj, nr, S * 2) + ); + /* print ("gen_prob: stage 1: ", stage1, ", stage 2: ", stage2, + ", Brent-Suyama: ", brsu); */ + return (stage1 + stage2 + brsu) +} + +/* The probability of ECM finding a factor near N with stage 1 parameter B1, + stage 2 parameter B2, and evaluating nr random but distinct points in + stage 2 with a degree -S Dickson polynomial (if S < 0) or the + S-th power (S > 0) as the Brent-Suyama function */ +ecmprob (B1, B2, N, nr, S, delta) = +{ + local (ldelta); + ldelta = 3.134; /* delta value for Brent-Suyama, but not sigma=11 */ + if (delta != 0, ldelta = delta); + return (gen_prob(B1, B2, N, nr, S, ldelta)); +} + +/* pm1prob is incomplete! */ + +pm1prob (B1, B2, N, nr, S) = +{ + local(delta); + /* The "root properties" of a large prime minus 1 are + alpha = sum_{p in Primes} log(p)/(p^2-2p+1) ~= 1.2269688... 
*/ + delta = 1.2269688056534700059656625687457626422689456478473; + return (gen_prob(B1, B2, N, nr, S, delta)); +} + +pp1prob4 (B1, B2, N, nr, S) = +{ + local(delta); + /* The "root properties" of a large prime minus 1 are + alpha = sum_{p in Primes} log(p)/(p^2-2p+1) ~= 1.2269688... */ + delta = 1.2269688056534700059656625687457626422689456478473 + log(2); + return (gen_prob(B1, B2, N, nr, S, delta)); +} + +pp1prob6 (B1, B2, N, nr, S) = +{ + local(delta); + /* The "root properties" of a large prime minus 1 are + alpha = sum_{p in Primes} log(p)/(p^2-2p+1) ~= 1.2269688... */ + delta = 1.2269688056534700059656625687457626422689456478473 + 3/4*log(3); + return (gen_prob(B1, B2, N, nr, S, delta)); +} diff -Nru gmp-ecm-7.0.4+ds/nodist/runecm2.c gmp-ecm-7.0.5+ds/nodist/runecm2.c --- gmp-ecm-7.0.4+ds/nodist/runecm2.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/nodist/runecm2.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,353 @@ +/* Program running ecm factoring processes in parallel, minimizing simultaneous + step2. + + Copyright 2005 Torbjörn Granlund. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; see the file COPYING. If not, write to the Free + Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + 02111-1307, USA. + + FIXME: We should clean up its reporting and logging functions, as well + as its error handling, if we release it. 
+*/ + +#include /* for fork, wait4 */ +#include /* for fork */ +#include /* for open */ +#include /* for wait4 */ +#include /* for gettimeofday, wait4 */ +#include /* for wait4 */ +#include +#include +#include +#include + +enum foo {NONE = 0, STEP1_RUNNING, STEP2_RUNNING, STEP1_DONE, STEP2_DONE}; + +struct jobinfo +{ + pid_t pid; + enum foo state; + char filename[50]; +}; + +#define RUNNING(x) (x.state == STEP1_RUNNING || x.state == STEP2_RUNNING) + +char *progname; + +unsigned long +bush () +{ + unsigned long ran; + int fd; + + fd = open ("/dev/urandom", O_RDONLY); + if (fd >= 0) + { + unsigned char buf[sizeof (long)]; + size_t nread; + nread = read (fd, buf, sizeof (long)); + if (nread != sizeof (long)) + goto stupid; + ran = (buf[0] << 24) + (buf[1] << 16) + (buf[2] << 8) + buf[3]; + if (sizeof (long) > 4) + { + unsigned long ran2; + ran2 = (buf[4] << 24) + (buf[5] << 16) + (buf[6] << 8) + buf[7]; + ran += (ran2 << 31) << 1; + } + close (fd); + return ran; + } + else + { + static int flag = 0; + stupid: + if (flag == 0) + { + struct timeval tp; + gettimeofday (&tp, NULL); + srand48 ((tp.tv_sec << 16) + tp.tv_usec + getpid ()); + flag = 1; + } + ran = mrand48 (); + } + return ran; +} + +char * +pathfind (const char *command) +{ + char *path, *p, *buf; + int len, clen; + + clen = strlen (command); + + path = getenv ("PATH"); + if (path == NULL) + abort (); + + buf = malloc (strlen (path) + clen + 2); + + for (;;) + { + p = strchr (path, ':'); + if (p == NULL) + len = strlen (path); + else + len = p - path; + + memcpy (buf, path, len); + if (buf[len - 1] != '/') + { + buf[len] = '/'; + memcpy (buf + len + 1, command, clen + 1); + } + else + { + memcpy (buf + len, command, clen + 1); + } + if (access (buf, X_OK) == 0) + return buf; + + path += len + 1; + } + + free (buf); + return NULL; +} + +#define BUFSIZE 65536 + +int +main (int argc, char *argv[], char *envp[]) +{ + int nprocs, i; + char *B1, *B2; + struct jobinfo *jiv; + pid_t pid; + int wstat; + char *tmpdir; 
+ char filename[50]; + int (*result_chan)[2]; /* for reading output from passed */ + int n_running_procs; + int fd; + char *ecmfactor; + char buf[BUFSIZE]; + char sigma[20]; + int next_cofac_i; + size_t nread; + struct rusage rus; + unsigned used_ms; + + nprocs = 1; /* default */ + B1 = NULL; + B2 = NULL; + next_cofac_i = 0; + + progname = argv[0]; + argv++; + argc--; + + while (argc >= 2 && argv[0][0] == '-') + { + if (strcmp ("-B1", argv[0]) == 0) + { + B1 = argv[1]; + argv += 2; + argc -= 2; + } + else if (strcmp ("-B2", argv[0]) == 0) + { + B2 = argv[1]; + argv += 2; + argc -= 2; + } + else if (strcmp ("-n", argv[0]) == 0) + { + nprocs = strtoul (argv[1], 0, 0); + argv += 2; + argc -= 2; + } + else + { + fprintf (stderr, "%s: unknown option: %s\n", progname, argv[0]); + exit (1); + } + } + printf ("There seem to be %d cofactor files\n", argc); + + if (B1 == NULL) + { + fprintf (stderr, "%s: missing B1 value\n", progname); + exit (1); + } + + ecmfactor = pathfind ("ecmfactor"); + + result_chan = malloc (nprocs * sizeof (int [2])); + + jiv = malloc (nprocs * sizeof (struct jobinfo)); + for (i = 0; i < nprocs; i++) + { + jiv[i].pid = 0; + jiv[i].state = NONE; + pipe (result_chan[i]); + } + + tmpdir = getenv ("TMPDIR"); + if (tmpdir == NULL) + tmpdir = "/tmp"; + + n_running_procs = 0; + + for (;;) + { + int n_step2_procs = 0; + for (i = 0; i < nprocs; i++) + { + n_step2_procs += (jiv[i].state == STEP2_RUNNING); + } + for (i = 0; i < nprocs; i++) + { + if (! 
RUNNING (jiv[i])) + { + if (jiv[i].state == STEP1_DONE) + { + if (n_step2_procs * 2 >= nprocs) + continue; + fprintf (stderr, "STARTING NEW STEP 2 JOB\n"); + pid = fork (); + if (pid == 0) + { + /* Child */ + char *outv[6], **op = outv; + dup2 (result_chan[i][1], 1); + close (result_chan[i][0]); + close (result_chan[i][1]); + *op++ = "ecmfactor"; + *op++ = "-resume"; + *op++ = jiv[i].filename; + *op++ = B1; + if (B2 != NULL) + *op++ = B2; + *op = NULL; + execve (ecmfactor, outv, envp); + fprintf (stderr, "cannot execute %s\n", ecmfactor); + abort (); + } + n_running_procs++; + jiv[i].pid = pid; + jiv[i].state = STEP2_RUNNING; + continue; + } + + for (;;) + { + if (next_cofac_i == 0) + sprintf (sigma, "%lu", bush ()); + strcpy (filename, argv[next_cofac_i]); + next_cofac_i = (next_cofac_i + 1) % argc; + fd = open (filename, O_RDONLY); + if (fd != -1) + break; + usleep (50000); + } + + sprintf (jiv[i].filename, "%s/ecm-save-%u", tmpdir, i); + unlink (jiv[i].filename); + fprintf (stderr, "STARTING NEW STEP 1 JOB\n"); + pid = fork (); + if (pid == 0) + { + /* Child */ + char *outv[8], **op = outv; + dup2 (result_chan[i][1], 1); + close (result_chan[i][0]); + close (result_chan[i][1]); + dup2 (fd, 0); + close (fd); + *op++ = "ecmfactor"; + *op++ = "-save"; + *op++ = jiv[i].filename; + *op++ = "-sigma"; + *op++ = sigma; + *op++ = B1; + *op++ = "1"; + *op = NULL; + execve (ecmfactor, outv, envp); + fprintf (stderr, "cannot execute %s\n", ecmfactor); + abort (); + } + close (fd); + n_running_procs++; + jiv[i].pid = pid; + jiv[i].state = STEP1_RUNNING; + continue; + } + } + + fprintf (stderr, "ABOUT TO WAIT (%d jobs running)\n", n_running_procs); + pid = wait4 (0, &wstat, 0, &rus); + if (pid == -1) + { + fprintf (stderr, "wait returned error %d\n", errno); + abort (); + } + n_running_procs--; + + used_ms = rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000; + + if (WIFSIGNALED (wstat)) + { + fprintf (stderr, "*** child got signal %d\n", WTERMSIG (wstat)); + exit (1); 
+ } + for (i = 0; i < nprocs; i++) + { + if (jiv[i].pid == pid) + goto yee; + } + abort (); + yee: + if (jiv[i].state == STEP1_RUNNING) + { + nread = read (result_chan[i][0], buf, BUFSIZE); + jiv[i].state = STEP1_DONE; + fprintf (stderr, "STEP 1 JOB %d FINISHED (used %u ms)\n", i, used_ms); + } + else if (jiv[i].state == STEP2_RUNNING) + { + unlink (jiv[i].filename); + nread = read (result_chan[i][0], buf, BUFSIZE); + jiv[i].state = STEP2_DONE; + fprintf (stderr, "STEP 2 JOB %d FINISHED (used %u ms)\n", i, used_ms); + } + else + abort (); + + if (WEXITSTATUS (wstat) == 0) + { + FILE *fs; + fs = popen ("mail -s \"NEW FACTOR\" invalid@example.net", "w"); + fwrite (buf, nread, 1, fs); + pclose (fs); + } + } +} + +/* + To perform only step 1: $ ./ecm -save toto B1 1 < composite + Then to perform step 2: $ ./ecm -resume toto B1 [B2] +*/ diff -Nru gmp-ecm-7.0.4+ds/parametrizations.c gmp-ecm-7.0.5+ds/parametrizations.c --- gmp-ecm-7.0.4+ds/parametrizations.c 2016-06-15 15:48:43.000000000 +0000 +++ gmp-ecm-7.0.5+ds/parametrizations.c 2022-06-06 14:16:49.000000000 +0000 @@ -236,22 +236,19 @@ int get_curve_from_param1 (mpres_t A, mpres_t x0, mpz_t sigma, mpmod_t n) { - int i; mpz_t tmp; mpz_init (tmp); ASSERT (GMP_NUMB_BITS == 64); - mpz_mul (tmp, sigma, sigma); /* tmp = sigma^2*/ - - /* A=4*d-2 with d = sigma^2/2^GMP_NUMB_BITS*/ - /* Compute d = sigma^2/2^GMP_NUMB_BITS */ - for (i = 0; i < GMP_NUMB_BITS; i++) - { - if (mpz_tstbit (tmp, 0) == 1) - mpz_add (tmp, tmp, n->orig_modulus); - mpz_div_2exp (tmp, tmp, 1); - } + /* A=4*d-2 with d = sigma^2/2^64 */ + /* Compute d = sigma^2/2^64 */ + mpz_ui_pow_ui(tmp, 2, 64); + mpz_invert(tmp, tmp, n->orig_modulus); + + /* tmp = sigma^2/2^64 */ + mpz_mul (tmp, tmp, sigma); + mpz_mul (tmp, tmp, sigma); mpz_mod (tmp, tmp, n->orig_modulus); /* TODO add d!=-1/8*/ @@ -387,30 +384,16 @@ int get_curve_from_param3 (mpres_t A, mpres_t x0, mpz_t sigma, mpmod_t n) { - int i; mpz_t tmp; - mpz_t two32; - mpz_init (two32); - mpz_ui_pow_ui (two32, 2, 
32); mpz_init (tmp); - /* sigma < 2^32 (it was generated for 32-bit machines) */ - /* To use it on a 64-bits machines one should multiplied it by 2^32 */ - if (GMP_NUMB_BITS == 64) - mpz_mul (tmp, sigma, two32); - else - mpz_set (tmp, sigma); - - /* A=4*d-2 with d = sigma/2^GMP_NUMB_BITS*/ - /* Compute d = sigma/2^GMP_NUMB_BITS */ - for (i = 0; i < GMP_NUMB_BITS; i++) - { - if (mpz_tstbit (tmp, 0) == 1) - mpz_add (tmp, tmp, n->orig_modulus); - mpz_div_2exp (tmp, tmp, 1); - } - + /* A=4*d-2 with d = sigma/2^32*/ + /* Compute d = sigma/2^32 */ + mpz_ui_pow_ui (tmp, 2, 32); + mpz_invert (tmp, tmp, n->orig_modulus); + mpz_mul (tmp, sigma, tmp); mpz_mod (tmp, tmp, n->orig_modulus); + /* TODO add d!=-1/8*/ if (mpz_sgn (tmp) == 0 || mpz_cmp_ui (tmp, 1) == 0) return ECM_ERROR; @@ -422,7 +405,6 @@ mpres_set_ui (x0, 2, n); mpz_clear(tmp); - mpz_clear (two32); return ECM_NO_FACTOR_FOUND; } diff -Nru gmp-ecm-7.0.4+ds/patch-config.guess.diff gmp-ecm-7.0.5+ds/patch-config.guess.diff --- gmp-ecm-7.0.4+ds/patch-config.guess.diff 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/patch-config.guess.diff 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,36 @@ +--- config.guess.bak 2009-04-01 12:07:34.000000000 +0200 ++++ config.guess 2009-04-01 12:11:37.000000000 +0200 +@@ -274,7 +274,8 @@ + # a function descriptor, not actual code. But this doesn't matter since + # AIX doesn't allow mfpvr anyway. + # +- cat >$dummy.c <<\EOF ++ if test "`uname`" != Darwin; then ++ cat >$dummy.c <<\EOF + #include + struct { + int n; /* force 4-byte alignment */ +@@ -312,15 +313,16 @@ + return 0; + } + EOF +- if ($CC_FOR_BUILD $dummy.c -o $dummy) >/dev/null 2>&1; then +- # This style construct is needed on AIX 4.3 to suppress the SIGILL error +- # from (*fun)(). Using $SHELL -c ./$dummy 2>/dev/null doesn't work. 
+- { x=`./$dummy`; } 2>/dev/null +- if test -n "$x"; then +- exact_cpu=$x ++ if ($CC_FOR_BUILD $dummy.c -o $dummy) >/dev/null 2>&1; then ++ # This style construct is needed on AIX 4.3 to suppress the SIGILL error ++ # from (*fun)(). Using $SHELL -c ./$dummy 2>/dev/null doesn't work. ++ { x=`./$dummy`; } 2>/dev/null ++ if test -n "$x"; then ++ exact_cpu=$x ++ fi + fi ++ rm -f $dummy.c $dummy.o $dummy $dummy.core + fi +- rm -f $dummy.c $dummy.o $dummy $dummy.core + + # Grep the linux kernel /proc/cpuinfo pseudo-file. + # Anything unrecognised is ignored, since of course we mustn't spit out diff -Nru gmp-ecm-7.0.4+ds/pentium4/autogen.py gmp-ecm-7.0.5+ds/pentium4/autogen.py --- gmp-ecm-7.0.4+ds/pentium4/autogen.py 2006-03-07 15:57:40.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/autogen.py 2022-06-06 14:16:49.000000000 +0000 @@ -3,6 +3,13 @@ import re import sys +# Final assembler statement to mark stack as not executable on linux elf platforms +# Single quotes are used around # to prevent M4 to discard them as comments. M4 will remove them. 
+noexecstack_statement = """ +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif +""" def offaddr(addr, offset): if offset == 0: @@ -194,7 +201,7 @@ k = int(sys.argv[1]) if k == 1: - print """# + print("""# # mp_limb_t mulredc1(mp_limb_t *z, const mp_limb_t x, const mp_limb_t y, # const mp_limb_t m, mp_limb_t inv_m) # @@ -237,7 +244,7 @@ movl %edx, (%ecx) adcl $0, %eax ret -""" +""" + noexecstack_statement) else: - print mulredc_k_rolled(k) + print(mulredc_k_rolled(k) + noexecstack_statement) diff -Nru gmp-ecm-7.0.4+ds/pentium4/generate_all gmp-ecm-7.0.5+ds/pentium4/generate_all --- gmp-ecm-7.0.4+ds/pentium4/generate_all 2006-03-07 15:57:39.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/generate_all 2022-06-06 14:16:49.000000000 +0000 @@ -1,6 +1,6 @@ #!/bin/sh -for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do +i=0; while [ $((i += 1)) -le 20 ]; do ./autogen.py $i > mulredc$i.asm done diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc10.asm gmp-ecm-7.0.5+ds/pentium4/mulredc10.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc10.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc10.asm 2022-06-06 14:16:49.000000000 +0000 @@ -287,3 +287,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc11.asm gmp-ecm-7.0.5+ds/pentium4/mulredc11.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc11.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc11.asm 2022-06-06 14:16:49.000000000 +0000 @@ -307,3 +307,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc12.asm gmp-ecm-7.0.5+ds/pentium4/mulredc12.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc12.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc12.asm 2022-06-06 14:16:49.000000000 +0000 @@ -327,3 +327,7 @@ emms ret +`#'if defined(__linux__) &&
defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc13.asm gmp-ecm-7.0.5+ds/pentium4/mulredc13.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc13.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc13.asm 2022-06-06 14:16:49.000000000 +0000 @@ -347,3 +347,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc14.asm gmp-ecm-7.0.5+ds/pentium4/mulredc14.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc14.asm 2006-03-07 15:57:39.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc14.asm 2022-06-06 14:16:49.000000000 +0000 @@ -367,3 +367,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc15.asm gmp-ecm-7.0.5+ds/pentium4/mulredc15.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc15.asm 2006-03-07 15:57:39.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc15.asm 2022-06-06 14:16:49.000000000 +0000 @@ -387,3 +387,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc16.asm gmp-ecm-7.0.5+ds/pentium4/mulredc16.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc16.asm 2006-03-07 15:57:39.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc16.asm 2022-06-06 14:16:49.000000000 +0000 @@ -407,3 +407,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc17.asm gmp-ecm-7.0.5+ds/pentium4/mulredc17.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc17.asm 2006-03-07 15:57:39.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc17.asm 2022-06-06 14:16:49.000000000 +0000 @@ -427,3 +427,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc18.asm 
gmp-ecm-7.0.5+ds/pentium4/mulredc18.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc18.asm 2006-03-07 15:57:39.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc18.asm 2022-06-06 14:16:49.000000000 +0000 @@ -447,3 +447,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc19.asm gmp-ecm-7.0.5+ds/pentium4/mulredc19.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc19.asm 2006-03-07 15:57:40.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc19.asm 2022-06-06 14:16:49.000000000 +0000 @@ -467,3 +467,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc1.asm gmp-ecm-7.0.5+ds/pentium4/mulredc1.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc1.asm 2006-03-07 15:57:40.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc1.asm 2022-06-06 14:16:49.000000000 +0000 @@ -42,3 +42,7 @@ adcl $0, %eax ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc20.asm gmp-ecm-7.0.5+ds/pentium4/mulredc20.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc20.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc20.asm 2022-06-06 14:16:49.000000000 +0000 @@ -487,3 +487,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc2.asm gmp-ecm-7.0.5+ds/pentium4/mulredc2.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc2.asm 2006-03-07 15:57:40.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc2.asm 2022-06-06 14:16:49.000000000 +0000 @@ -127,3 +127,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc3.asm gmp-ecm-7.0.5+ds/pentium4/mulredc3.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc3.asm 2006-03-07 15:57:40.000000000 +0000 +++ 
gmp-ecm-7.0.5+ds/pentium4/mulredc3.asm 2022-06-06 14:16:49.000000000 +0000 @@ -147,3 +147,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc4.asm gmp-ecm-7.0.5+ds/pentium4/mulredc4.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc4.asm 2006-03-07 15:57:40.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc4.asm 2022-06-06 14:16:49.000000000 +0000 @@ -167,3 +167,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc5.asm gmp-ecm-7.0.5+ds/pentium4/mulredc5.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc5.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc5.asm 2022-06-06 14:16:49.000000000 +0000 @@ -187,3 +187,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc6.asm gmp-ecm-7.0.5+ds/pentium4/mulredc6.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc6.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc6.asm 2022-06-06 14:16:49.000000000 +0000 @@ -207,3 +207,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc7.asm gmp-ecm-7.0.5+ds/pentium4/mulredc7.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc7.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc7.asm 2022-06-06 14:16:49.000000000 +0000 @@ -227,3 +227,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc8.asm gmp-ecm-7.0.5+ds/pentium4/mulredc8.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc8.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc8.asm 2022-06-06 14:16:49.000000000 +0000 @@ -247,3 +247,7 @@ emms ret +`#'if defined(__linux__) && 
defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/pentium4/mulredc9.asm gmp-ecm-7.0.5+ds/pentium4/mulredc9.asm --- gmp-ecm-7.0.4+ds/pentium4/mulredc9.asm 2006-03-07 15:57:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pentium4/mulredc9.asm 2022-06-06 14:16:49.000000000 +0000 @@ -267,3 +267,7 @@ emms ret +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif + diff -Nru gmp-ecm-7.0.4+ds/phiP.gp gmp-ecm-7.0.5+ds/phiP.gp --- gmp-ecm-7.0.4+ds/phiP.gp 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/phiP.gp 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,54 @@ + +largest_primefactor(n) = vecmax(factorint(n)~[1,]) + +/* We examine P values with eulerphi(P) in [mini, maxi], + maxi = mini + multphi*(len - 1) */ +/* We assume that P is divisible by multP and that + phiP and mini are divisible by multphi, so the array + entry phi[i] contains a P such that phiP = eulerphi(P) and + (phiP - mini) / multphi + 1 = i. (phi[i] contains zero if no such + P was found).
Conversely, phiP = (i - 1) * multphi + mini */ + +/* mini = 150000000; */ +/* len = 10000000; */ +/* multP = 3*5*7*11; */ + +make_phiP (mini, len, multP, oldbest) = +{ + local (multphi, maxi, phi, P, phiP, best, multP2); + multphi = eulerphi(multP); + best = oldbest; + if (mini % multphi != 0, + error("mini = ", mini, " is not a multiple of multphi = ", multphi)); + maxi = mini + multphi * (len - 1); + phi = vector(len); + + /* For each candidate odd P value < 4*maxi and a multiple of multP, + if eulerphi(P) is in the [mini, maxi] range, store the P value at + phi[i] with i = (eulerphi(P) - mini) / multphi + 1 */ + multP2 = 2*multP; + P = 3*multP; + + while (P < maxi*4, + phiP = eulerphi(P); + if (phiP % multphi != 0, + error("phiP = ", phiP, " is not a multiple of multphi = ", multphi) + ); + if (mini <= phiP && phiP <= maxi, phi[(phiP - mini) / multphi + 1] = P); + P += multP2; + ); + + /* Go through the array and report large P, P/phi(P) combinations. + best contains the maximal value of P * phiP seen so far */ + + for (i = 1, len, + P = phi[i]; + phiP = (i - 1) * multphi + mini; + if (P * phiP > best * 1.05 && largest_primefactor(phiP) <= 19, + print(P, " = ", factorint(P), ", ", phiP, " = ", factorint(phiP), " P/phi(P) = ", 1. * P / phiP, " ", 1. 
* P * phiP / best); + best = P * phiP + ); + ); + + return (best); +} diff -Nru gmp-ecm-7.0.4+ds/pm1.c gmp-ecm-7.0.5+ds/pm1.c --- gmp-ecm-7.0.4+ds/pm1.c 2016-08-26 08:25:09.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pm1.c 2022-06-06 14:16:49.000000000 +0000 @@ -322,7 +322,7 @@ } -static void +void print_prob (double B1, const mpz_t B2, unsigned long dF, unsigned long k, int S, const mpz_t go) { @@ -330,7 +330,7 @@ int i; char sep; - outputf (OUTPUT_VERBOSE, "Probability of finding a factor of n digits:\n"); + outputf (OUTPUT_VERBOSE, "Probability of finding a factor of n digits (assuming one exists):\n"); outputf (OUTPUT_VERBOSE, "20\t25\t30\t35\t40\t45\t50\t55\t60\t65\n"); for (i = 20; i <= 65; i += 5) { @@ -389,14 +389,14 @@ if (mpz_cmp_ui (p, 0) == 0) pm1_random_seed (p, N, rng); - + mpz_init_set (B2min, B2min_parm); mpz_init_set (B2, B2_parm); - + /* Set default B2. See ecm.c for comments */ if (ECM_IS_DEFAULT_B2(B2)) mpz_set_d (B2, pow (B1 * PM1FS2_COST, PM1FS2_DEFAULT_B2_EXPONENT)); - + /* set B2min */ if (mpz_sgn (B2min) < 0) mpz_set_d (B2min, B1); @@ -490,29 +490,33 @@ /* Now decide whether to take NTT or non-NTT. Since the non-NTT code uses more memory, we only use it when -no-ntt was given, or when - we can't find good parameters for the NTT code. */ - if (use_ntt == 0 || P_ntt == ECM_ERROR) + we can't find good parameters for the NTT code. + Warning: we only do that when B2 >= B2min. 
*/ + if (mpz_cmp (B2, B2min) >= 0) { - better_params = &params_nontt; - mpz_set (B2min, effB2min_nontt); - mpz_set (B2, effB2_nontt); - use_ntt = 0; + if (use_ntt == 0 || P_ntt == ECM_ERROR) + { + better_params = &params_nontt; + mpz_set (B2min, effB2min_nontt); + mpz_set (B2, effB2_nontt); + use_ntt = 0; + } + else + { + better_params = &params_ntt; + mpz_set (B2min, effB2min_ntt); + mpz_set (B2, effB2_ntt); + use_ntt = 1; + } + + params.P = better_params->P; + params.s_1 = better_params->s_1; + params.s_2 = better_params->s_2; + params.l = better_params->l; + mpz_set (params.m_1, better_params->m_1); + params.file_stem = TreeFilename; + params.file_stem = TreeFilename; } - else - { - better_params = &params_ntt; - mpz_set (B2min, effB2min_ntt); - mpz_set (B2, effB2_ntt); - use_ntt = 1; - } - - params.P = better_params->P; - params.s_1 = better_params->s_1; - params.s_2 = better_params->s_2; - params.l = better_params->l; - mpz_set (params.m_1, better_params->m_1); - params.file_stem = TreeFilename; - params.file_stem = TreeFilename; mpz_clear (params_ntt.m_1); mpz_clear (params_nontt.m_1); diff -Nru gmp-ecm-7.0.4+ds/pm1fs2.c gmp-ecm-7.0.5+ds/pm1fs2.c --- gmp-ecm-7.0.4+ds/pm1fs2.c 2016-08-26 11:59:15.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pm1fs2.c 2022-06-06 14:16:49.000000000 +0000 @@ -1562,17 +1562,18 @@ #pragma omp parallel if (deg > 1000) { mpmod_t modulus_local; - long i; /* OpenMP insists on signed loop iteration var :( */ - +#ifdef _MSC_VER + long i; /* Microsoft C/C++ stuck on OpenMP 2.0 :( */ +#endif mpmod_init_set (modulus_local, modulus); #pragma omp for - for (i = 0; (unsigned long) i <= 2 * deg - 2; i++) + for (i = 0; i <= 2 * deg - 2; i++) mpres_mul_z_to_z (H[i], Vt, H[i], modulus_local); mpmod_clear (modulus_local); } #else - for (i = 0; (unsigned long) i <= 2 * deg - 2; i++) + for (i = 0; i <= 2 * deg - 2; i++) mpres_mul_z_to_z (H[i], Vt, H[i], modulus); #endif @@ -1849,7 +1850,7 @@ if (c == 2UL) { - /* Check it's symmetric (we write c-1 instead of 2 to avoid a + /*
Check it's symmetric (we write c-1 instead of 1 to avoid a compiler warning with clang 2.9) */ ASSERT_ALWAYS (set->elem[0] == -set->elem[c - 1]); V (Qt, Q, set->elem[0], modulus); @@ -1864,7 +1865,7 @@ ASSERT_ALWAYS (c % 2UL == 1UL); ASSERT_ALWAYS (set->elem[(c - 1UL) / 2UL] == 0UL); /* Generate the F(Q^{2k_i} * X)*F(Q^{-2k_i} * X) polynomials. - Each is symmetric of degree 2*deg, so each has deg+1 coeffients + Each is symmetric of degree 2*deg, so each has deg+1 coefficients in standard basis. */ for (i = 0UL; i < (c - 1UL) / 2UL; i++) { @@ -2221,6 +2222,7 @@ if (thread_nr == 0) outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks); + /* chunklen = ceil (len / nr_chunks) */ chunklen = (len - 1UL) / (unsigned long) nr_chunks + 1UL; offset = chunklen * (unsigned long) thread_nr; if (offset <= len) @@ -3252,7 +3254,7 @@ mpmod_t modulus, unsigned long tmplen, mpres_t *tmp) { mpz_t abs_e; - unsigned long idx; + size_t idx; ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1); @@ -3265,8 +3267,8 @@ mpz_init (abs_e); mpz_abs (abs_e, e); - idx = mpz_sizeinbase (abs_e, 2) - 1; /* Thus mpz_tstbit (abs_e, idx) == 1 */ - ASSERT (mpz_tstbit (abs_e, idx) == 1); + idx = mpz_sizeinbase (abs_e, 2) - 1; /* Thus ecm_tstbit (abs_e, idx) == 1 */ + ASSERT (ecm_tstbit (abs_e, idx) == 1); mpres_set (r0, a0, modulus); mpres_set (r1, a1, modulus); @@ -3275,7 +3277,7 @@ { gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus); idx--; - if (mpz_tstbit (abs_e, idx)) + if (ecm_tstbit (abs_e, idx)) gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp); } diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc10.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc10.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc10.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc10.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,565 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. 
-dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc10(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc10 - GLOBL .GSYM_PREFIX`'mulredc10 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc10: - .quad .GSYM_PREFIX`'mulredc10, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc10, 24 - - -C Implements multiplication and REDC for two input numbers of 10 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C 
tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
10] array, having 10+1 8-byte words -C The tmp array needs 10+1 entries, but tmp[10] is stored in -C r15, so only 10 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc10: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 80 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 64(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 72(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 9 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 
- 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 72(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc10, .-.GSYM_PREFIX`'mulredc10 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc11.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc11.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc11.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc11.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,606 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. 
-dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc11(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc11 - GLOBL .GSYM_PREFIX`'mulredc11 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc11: - .quad .GSYM_PREFIX`'mulredc11, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc11, 24 - - -C Implements multiplication and REDC for two input numbers of 11 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = 
cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
11] array, having 11+1 8-byte words -C The tmp array needs 11+1 entries, but tmp[11] is stored in -C r15, so only 11 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc11: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 88 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 72(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 80(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 10 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + 
pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, 
r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C 
x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 80(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - stdu r8, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc11, .-.GSYM_PREFIX`'mulredc11 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc12.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc12.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc12.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc12.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,647 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. 
-dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc12(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc12 - GLOBL .GSYM_PREFIX`'mulredc12 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc12: - .quad .GSYM_PREFIX`'mulredc12, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc12, 24 - - -C Implements multiplication and REDC for two input numbers of 12 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = 
cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
12] array, having 12+1 8-byte words -C The tmp array needs 12+1 entries, but tmp[12] is stored in -C r15, so only 12 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc12: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 96 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 11. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 80(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 88(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 11 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 
- 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10 - - ld r14, 88(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 11. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 80(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 88(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc12, .-.GSYM_PREFIX`'mulredc12 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc13.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc13.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc13.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc13.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,688 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. 
-dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc13(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc13 - GLOBL .GSYM_PREFIX`'mulredc13 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc13: - .quad .GSYM_PREFIX`'mulredc13, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc13, 24 - - -C Implements multiplication and REDC for two input numbers of 13 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = 
cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
13] array, having 13+1 8-byte words -C The tmp array needs 13+1 entries, but tmp[13] is stored in -C r15, so only 13 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc13: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 104 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 11 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 12. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 88(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 96(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 12 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 
- 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10 - - ld r14, 88(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 11 - - ld r14, 96(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 80(r1) C store tmp[j-1] - C 
CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 12. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 88(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 96(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - stdu r8, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc13, .-.GSYM_PREFIX`'mulredc13 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc14.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc14.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc14.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc14.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,729 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. 
-dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc14(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc14 - GLOBL .GSYM_PREFIX`'mulredc14 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc14: - .quad .GSYM_PREFIX`'mulredc14, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc14, 24 - - -C Implements multiplication and REDC for two input numbers of 14 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = 
cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
14] array, having 14+1 8-byte words -C The tmp array needs 14+1 entries, but tmp[14] is stored in -C r15, so only 14 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc14: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 112 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 11 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 12 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 13. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 96(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 104(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 13 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending 
- std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, 
r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 
64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10 - - ld r14, 88(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C 
add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 11 - - ld r14, 96(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 12 - - ld r14, 104(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 13. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 96(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 104(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc14, .-.GSYM_PREFIX`'mulredc14 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc15.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc15.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc15.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc15.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,770 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. 
-dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc15(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc15 - GLOBL .GSYM_PREFIX`'mulredc15 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc15: - .quad .GSYM_PREFIX`'mulredc15, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc15, 24 - - -C Implements multiplication and REDC for two input numbers of 15 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = 
cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
15] array, having 15+1 8-byte words -C The tmp array needs 15+1 entries, but tmp[15] is stored in -C r15, so only 15 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc15: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 120 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 11 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 12 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 13 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 14. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 104(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 112(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 14 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 
- 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10 - - ld r14, 88(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 11 - - ld r14, 96(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 80(r1) C store tmp[j-1] - C 
CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 12 - - ld r14, 104(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 13 - - ld r14, 112(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 14. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 104(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 112(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - stdu r8, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc15, .-.GSYM_PREFIX`'mulredc15 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc16.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc16.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc16.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc16.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,811 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. 
-dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc16(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc16 - GLOBL .GSYM_PREFIX`'mulredc16 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc16: - .quad .GSYM_PREFIX`'mulredc16, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc16, 24 - - -C Implements multiplication and REDC for two input numbers of 16 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = 
cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
16] array, having 16+1 8-byte words -C The tmp array needs 16+1 entries, but tmp[16] is stored in -C r15, so only 16 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc16: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 128 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 11 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 12 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 13 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 14 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 15. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 112(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 120(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 15 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 
- 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10 - - ld r14, 88(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 11 - - ld r14, 96(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 80(r1) C store tmp[j-1] - C 
CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 12 - - ld r14, 104(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 13 - - ld r14, 112(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 14 - - ld r14, 120(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C T1, carry 
pending - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 15. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 112(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 120(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc16, .-.GSYM_PREFIX`'mulredc16 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc17.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc17.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc17.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc17.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,852 +0,0 @@ -dnl 
****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc17(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc17 - GLOBL .GSYM_PREFIX`'mulredc17 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc17: - .quad .GSYM_PREFIX`'mulredc17, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc17, 24 - - -C Implements multiplication and REDC for two input numbers of 17 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for 
(j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. 
-C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 17] array, having 17+1 8-byte words -C The tmp array needs 17+1 entries, but tmp[17] is stored in -C r15, so only 17 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc17: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 136 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 11 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 12 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 13 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 14 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 15 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 128(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 112(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 16. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 128(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 120(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 128(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 16 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry 
pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - 
adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low 
word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10 - - ld r14, 88(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc 
r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 11 - - ld r14, 96(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 12 - - ld r14, 104(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 13 - - ld r14, 112(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - 
mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 14 - - ld r14, 120(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 15 - - ld r14, 128(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 128(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 112(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 16. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 128(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 120(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 128(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - stdu r8, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc17, .-.GSYM_PREFIX`'mulredc17 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc18.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc18.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc18.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc18.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,893 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. 
-dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc18(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc18 - GLOBL .GSYM_PREFIX`'mulredc18 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc18: - .quad .GSYM_PREFIX`'mulredc18, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc18, 24 - - -C Implements multiplication and REDC for two input numbers of 18 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = 
cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
18] array, having 18+1 8-byte words -C The tmp array needs 18+1 entries, but tmp[18] is stored in -C r15, so only 18 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc18: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 144 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 11 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 12 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 13 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 14 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 15 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 128(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 112(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 16 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 128(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 136(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 120(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 17. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 136(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 128(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 136(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 17 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 
- 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10 - - ld r14, 88(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 11 - - ld r14, 96(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 80(r1) C store tmp[j-1] - C 
CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 12 - - ld r14, 104(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 13 - - ld r14, 112(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 14 - - ld r14, 120(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C T1, carry 
pending - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 15 - - ld r14, 128(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 128(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 112(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 16 - - ld r14, 136(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 128(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 136(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 120(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 17. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 136(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 128(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 136(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc18, .-.GSYM_PREFIX`'mulredc18 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc19.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc19.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc19.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc19.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,934 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. 
-dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc19(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc19 - GLOBL .GSYM_PREFIX`'mulredc19 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc19: - .quad .GSYM_PREFIX`'mulredc19, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc19, 24 - - -C Implements multiplication and REDC for two input numbers of 19 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = 
cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
19] array, having 19+1 8-byte words -C The tmp array needs 19+1 entries, but tmp[19] is stored in -C r15, so only 19 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc19: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 152 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 11 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 12 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 13 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 14 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 15 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 128(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 112(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 16 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 128(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 136(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 120(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 17 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 136(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 144(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 128(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 18. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 144(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 136(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 144(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 18 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 
- 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10 - - ld r14, 88(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 11 - - ld r14, 96(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 80(r1) C store tmp[j-1] - C 
CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 12 - - ld r14, 104(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 13 - - ld r14, 112(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 14 - - ld r14, 120(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C T1, carry 
pending - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 15 - - ld r14, 128(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 128(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 112(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 16 - - ld r14, 136(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 128(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 136(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 120(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 17 - - ld r14, 144(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 136(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 
144(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 128(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 18. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 144(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 136(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 144(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - stdu r8, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc19, .-.GSYM_PREFIX`'mulredc19 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc1.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc1.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc1.asm 2016-06-29 13:57:52.000000000 +0000 +++ 
gmp-ecm-7.0.5+ds/powerpc64/mulredc1.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,65 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. 
-dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc1(mp_limb_t * z, const mp_limb_t x, const mp_limb_t y, -C const mp_limb_t m, mp_limb_t inv_m); -C -C arguments: -C r3 : ptr to result z -C r4 : input x -C r5 : input y -C r6 : modulus m' -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc1 - GLOBL .GSYM_PREFIX`'mulredc1 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc1: - .quad .GSYM_PREFIX`'mulredc1, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc1, 24 - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc1: - mulld r8, r4, r5 C x*y low half T0 - mulhdu r9, r4, r5 C x*y high half T1 - mulld r0, r7, r8 C u = t0 * invm - mulld r10, r0, r6 C u*m low - mulhdu r11, r0, r6 C u*m high - addc r8, r8, r10 C x*y + u*m low (= zero) - adde r9, r9, r11 C result - std r9, 0(r3) C store in z - addze r3, r8 C return carry - blr - - .size .GSYM_PREFIX`'mulredc1, .-.GSYM_PREFIX`'mulredc1 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc20.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc20.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc20.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc20.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,975 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc20(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc20 - GLOBL .GSYM_PREFIX`'mulredc20 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc20: - .quad .GSYM_PREFIX`'mulredc20, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc20, 24 - - -C Implements multiplication and REDC for two input numbers of 20 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = 
cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 20] array, having 20+1 8-byte words -C The tmp array needs 20+1 entries, but tmp[20] is stored in -C r15, so only 20 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc20: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 160 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 9 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 10 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 11 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 80(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 12 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 13 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 14 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 15 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 128(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 112(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 16 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 128(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 136(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 120(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 17 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 136(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 144(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 128(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 18 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 144(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 152(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 136(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 19. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 152(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 144(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 152(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 19 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8 - - ld r14, 72(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 72(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 
- 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 9 - - ld r14, 80(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 72(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 80(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 64(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 10 - - ld r14, 88(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 80(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 88(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 72(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 11 - - ld r14, 96(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 88(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 96(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 80(r1) C store tmp[j-1] - C 
CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 12 - - ld r14, 104(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 96(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 104(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 88(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 13 - - ld r14, 112(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 104(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 112(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 96(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 14 - - ld r14, 120(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 112(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 120(r5) C y[j+1] - adde r13, r9, r14 C T1, carry 
pending - std r8, 104(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 15 - - ld r14, 128(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 120(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 128(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 112(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 16 - - ld r14, 136(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 128(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 136(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 120(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 17 - - ld r14, 144(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 136(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 
144(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 128(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 18 - - ld r14, 152(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 144(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 152(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 136(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 19. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 152(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 144(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 152(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - 
stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc20, .-.GSYM_PREFIX`'mulredc20 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc2.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc2.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc2.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc2.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,122 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. 
-dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc2(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc2 - GLOBL .GSYM_PREFIX`'mulredc2 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc2: - .quad .GSYM_PREFIX`'mulredc2, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc2, 24 - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc2: - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result zero - mulld r8, r0, r12 C x[0]*y[1] low half - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - - mulhdu r9, r0, r12 C x[0]*y[1] high half - ld r0, 8(r6) C m[1] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[1] low - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulhdu r9, r0, r11 C U*m[1] high - ldu r12, 8(r4) C x[1] - ld r0, 0(r5) C y[0] - addc r13, r8, r13 C add T0 and low word - mulld r8, r0, r12 C x[1]*y[0] low half - adde r14, r9, r14 C add high word with carry to T1 - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - mulhdu r9, r0, r12 C x[1]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - - mulld r8, r0, r12 C x[1]*y[1] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[1]*y[1] high half - ld r0, 8(r6) C m[1] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[1] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[1] high - addc r8, r8, r13 C add T0 and low word - adde r9, r9, r14 C T1, carry pending - std r8, 0(r3) C copy result to z - stdu r9, 8(r3) - - addze r3, r10 C return tmp(len) - ld r16, 0(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc2, .-.GSYM_PREFIX`'mulredc2 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc3.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc3.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc3.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc3.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,278 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. 
-dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc3(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc3 - GLOBL .GSYM_PREFIX`'mulredc3 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc3: - .quad .GSYM_PREFIX`'mulredc3, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc3, 24 - - -C Implements multiplication and REDC for two input numbers of 3 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; 
/* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 
3] array, having 3+1 8-byte words -C The tmp array needs 3+1 entries, but tmp[3] is stored in -C r15, so only 3 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc3: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 24 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 8(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 16(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 2 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - 
std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 16(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - stdu r8, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc3, .-.GSYM_PREFIX`'mulredc3 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc4.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc4.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc4.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc4.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,319 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. 
-dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc4(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc4 - GLOBL .GSYM_PREFIX`'mulredc4 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc4: - .quad .GSYM_PREFIX`'mulredc4, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc4, 24 - - -C Implements multiplication and REDC for two input numbers of 4 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + 
(tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 4] array, having 4+1 8-byte words -C The tmp array needs 4+1 entries, but tmp[4] is stored in -C r15, so only 4 entries are used in the stack. 
- - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc4: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 32 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 16(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 24(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 3 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 24(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc4, .-.GSYM_PREFIX`'mulredc4 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc5.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc5.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc5.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc5.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,360 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. 
-dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc5(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc5 - GLOBL .GSYM_PREFIX`'mulredc5 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc5: - .quad .GSYM_PREFIX`'mulredc5, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc5, 24 - - -C Implements multiplication and REDC for two input numbers of 5 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + 
(tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 5] array, having 5+1 8-byte words -C The tmp array needs 5+1 entries, but tmp[5] is stored in -C r15, so only 5 entries are used in the stack. 
- - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc5: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 40 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 24(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 32(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 4 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 32(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - stdu r8, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc5, .-.GSYM_PREFIX`'mulredc5 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc6.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc6.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc6.asm 2016-06-29 13:57:53.000000000 +0000 
+++ gmp-ecm-7.0.5+ds/powerpc64/mulredc6.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,401 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. 
-dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc6(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc6 - GLOBL .GSYM_PREFIX`'mulredc6 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc6: - .quad .GSYM_PREFIX`'mulredc6, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc6, 24 - - -C Implements multiplication and REDC for two input numbers of 6 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. 
-C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 6] array, having 6+1 8-byte words -C The tmp array needs 6+1 entries, but tmp[6] is stored in -C r15, so only 6 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc6: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 48 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 32(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 40(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 5 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + 
pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 40(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc6, .-.GSYM_PREFIX`'mulredc6 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc7.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc7.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc7.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc7.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,442 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. 
-dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc7(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc7 - GLOBL .GSYM_PREFIX`'mulredc7 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc7: - .quad .GSYM_PREFIX`'mulredc7, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc7, 24 - - -C Implements multiplication and REDC for two input numbers of 7 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + 
(tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 7] array, having 7+1 8-byte words -C The tmp array needs 7+1 entries, but tmp[7] is stored in -C r15, so only 7 entries are used in the stack. 
- - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc7: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 56 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 40(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 48(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 6 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 48(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - stdu r8, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc7, .-.GSYM_PREFIX`'mulredc7 - diff -Nru gmp-ecm-7.0.4+ds/powerpc64/mulredc8.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc8.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc8.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc8.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,483 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. 
-dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. -dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc8(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc8 - GLOBL .GSYM_PREFIX`'mulredc8 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc8: - .quad .GSYM_PREFIX`'mulredc8, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc8, 24 - - -C Implements multiplication and REDC for two input numbers of 8 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + 
(tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. -C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 8] array, having 8+1 8-byte words -C The tmp array needs 8+1 entries, but tmp[8] is stored in -C r15, so only 8 entries are used in the stack. 
- - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc8: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 64 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 48(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 56(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 7 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 
3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 
3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 56(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc8, .-.GSYM_PREFIX`'mulredc8 - diff -Nru 
gmp-ecm-7.0.4+ds/powerpc64/mulredc9.asm gmp-ecm-7.0.5+ds/powerpc64/mulredc9.asm --- gmp-ecm-7.0.4+ds/powerpc64/mulredc9.asm 2016-06-29 13:57:53.000000000 +0000 +++ gmp-ecm-7.0.5+ds/powerpc64/mulredc9.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,524 +0,0 @@ -dnl ****************************************************************************** -dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. -dnl -dnl This file is part of the ECM Library. -dnl -dnl The ECM Library is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU Lesser General Public License as published by -dnl the Free Software Foundation; either version 3 of the License, or (at your -dnl option) any later version. -dnl -dnl The ECM Library is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -dnl License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public License -dnl along with the ECM Library; see the file COPYING.LIB. If not, write to -dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -dnl MA 02110-1301, USA. 
-dnl ****************************************************************************** - -define(C, ` -dnl') - -C mp_limb_t mulredc9(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -C const mp_limb_t *m, mp_limb_t inv_m); -C -C arguments: -C r3 = ptr to result z least significant limb -C r4 = ptr to input x least significant limb -C r5 = ptr to input y least significant limb -C r6 = ptr to modulus m least significant limb -C r7 = -1/m mod 2^64 -C -C final carry returned in r3 - - - -include(`config.m4') - - GLOBL GSYM_PREFIX`'mulredc9 - GLOBL .GSYM_PREFIX`'mulredc9 - - .section ".opd", "aw" - .align 3 -GSYM_PREFIX`'mulredc9: - .quad .GSYM_PREFIX`'mulredc9, .TOC.@tocbase, 0 - .size GSYM_PREFIX`'mulredc9, 24 - - -C Implements multiplication and REDC for two input numbers of 9 words - -C The algorithm: -C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) -C -C T1:T0 = x[i]*y[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; -C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 (see note 2) */ -C for (i = 1; i < len; i++) -C { -C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; -C u = (T0*invm) % 2^64 ; -C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ -C for (j = 1; j < len; j++) -C { -C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; -C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 -C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ -C tmp[j-1] = T0; -C } -C tmp[len-1] = T1 ; -C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ -C } -C z[0 ... len-1] = tmp[0 ... len-1] ; -C return (tmp[len]) ; -C -C notes: -C -C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, -C so cy:T1 <= 2*2^64 - 4. 
-C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 -C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), -C so cy:T1 <= 2*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), -C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. -C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, -C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) -C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 -C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), -C so cy:T1 <= 3*2^64 - 3. For j > 1, -C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), -C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. -C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. -C Assume this is true for index i-1, Then -C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 -C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 -C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), -C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. -C -C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 -C YP = r5, MP = r6, TP = r1 (stack ptr) -C - -C local variables: tmp[0 ... 9] array, having 9+1 8-byte words -C The tmp array needs 9+1 entries, but tmp[9] is stored in -C r15, so only 9 entries are used in the stack. - - - TEXT - .align 5 C powerPC 32 byte alignment -.GSYM_PREFIX`'mulredc9: - -C ######################################################################## -C # i = 0 pass -C ######################################################################### - -C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - ld r12, 0(r4) C XI = x[0] - ld r0, 0(r5) C y[0] - stdu r13, -8(r1) C save r13 - mulld r8, r0, r12 C x[0]*y[0] low half - stdu r14, -8(r1) C save r14 - mulhdu r9, r0, r12 C x[0]*y[0] high half - ld r0, 0(r6) C m[0] - mulld r11, r7, r8 C U = T0*invm mod 2^64 - stdu r15, -8(r1) C save r15 - mulld r13, r0, r11 C T0 = U*m[0] low - stdu r16, -8(r1) C save r16 - li r16, 0 C set r16 to zero for carry propagation - subi r1, r1, 72 C set tmp stack space - mulhdu r14, r0, r11 C T1 = U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C - adde r13, r9, r14 C T0 = initial tmp(0) - addze r10, r16 C carry to CY - C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - C CY:T1 <= 2*2^64 - 4 - -C Pass for j = 1 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 2 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 3 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 4 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 5 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 6 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 7 - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! - - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C add high word with carry to T1 - addze r10, r16 C carry to CY - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= - C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 - -C Pass for j = 8. Don't fetch new data from y[j+1]. - - mulld r8, r0, r12 C x[i]*y[j] low half - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - adde r14, r9, r10 C add high word with carry + CY to T1 - C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
- - mulld r8, r0, r11 C U*m[j] low - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C add high word with carry to T1 - std r8, 56(r1) C store tmp[len-2] - addze r15, r16 C put carry in r15 (tmp[len] <= 1) - std r13, 64(r1) C store tmp[len-1] - - -C ######################################################################### -C # i > 0 passes -C ######################################################################### - - - li r9, 8 C outer loop count - mtctr r9 - -1: - -C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -C and compute the new u - - ldu r12, 8(r4) C x[i] - ld r0, 0(r5) C y[0] - ld r13, 0(r1) C tmp[0] - mulld r8, r0, r12 C x[i]*y[0] low half - ld r14, 8(r1) C tmp[1] - mulhdu r9, r0, r12 C x[i]*y[0] high half - addc r13, r8, r13 C T0 - ld r0, 0(r6) C m[0] - mulld r11, r7, r13 C U = T0*invm mod 2^64 - adde r14, r9, r14 C T1 - mulld r8, r0, r11 C U*m[0] low - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[0] high - ld r0, 8(r5) C y[1] - addc r8, r8, r13 C result = 0 - adde r13, r9, r14 C T0, carry pending - C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, - C so cy:T1 <= 3*2^64 - 4 - -C Pass for j = 1 - - ld r14, 16(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 8(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 16(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 0(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 2 - - ld r14, 24(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + 
pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 16(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 24(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 8(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 3 - - ld r14, 32(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 24(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 32(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 16(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 4 - - ld r14, 40(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 32(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 40(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 24(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 5 - - ld r14, 48(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, 
r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 40(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 48(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 32(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 6 - - ld r14, 56(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 48(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 56(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 40(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 7 - - ld r14, 64(r1) C tmp[j+1] - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r14, r10 C tmp[j+1] + CY + pending carry - addze r10, r16 C carry to CY - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 56(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r10 C add carry to CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - ld r0, 64(r5) C y[j+1] - adde r13, r9, r14 C T1, carry pending - std r8, 48(r1) C store tmp[j-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 - C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 - -C Pass for j = 8. Don't fetch new data from y[j+1]. 
- - mulld r8, r0, r12 C x[i]*y[j] low half - adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry - C since tmp[len] <= 1, T1 <= 3 and carry is zero - mulhdu r9, r0, r12 C x[i]*y[j] high half - ld r0, 64(r6) C m[j] - addc r13, r8, r13 C add low word to T0 - mulld r8, r0, r11 C U*m[j] low - adde r14, r9, r14 C add high to T1 - addze r10, r16 C CY - mulhdu r9, r0, r11 C U*m[j] high - addc r8, r8, r13 C add T0 and low word - adde r13, r9, r14 C T1, carry pending - std r8, 56(r1) C store tmp[len-2] - addze r15, r10 C store tmp[len] <= 1 - std r13, 64(r1) C store tmp[len-1] - C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 - C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) - - bdnz 1b - -C Copy result from tmp memory to z - - ld r8, 0(r1) - ldu r9, 8(r1) - std r8, 0(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - ldu r9, 8(r1) - stdu r8, 8(r3) - stdu r9, 8(r3) - ldu r8, 8(r1) - stdu r8, 8(r3) - - mr r3, r15 C return tmp(len) - ldu r16, 8(r1) - ldu r15, 8(r1) - ldu r14, 8(r1) - ldu r13, 8(r1) - addi r1, r1, 8 - blr - - .size .GSYM_PREFIX`'mulredc9, .-.GSYM_PREFIX`'mulredc9 - diff -Nru gmp-ecm-7.0.4+ds/pp1.c gmp-ecm-7.0.5+ds/pp1.c --- gmp-ecm-7.0.4+ds/pp1.c 2016-02-24 17:15:19.000000000 +0000 +++ gmp-ecm-7.0.5+ds/pp1.c 2022-06-06 14:16:49.000000000 +0000 @@ -61,8 +61,7 @@ static void pp1_mul (mpres_t P1, mpres_t P0, mpz_t e, mpmod_t n, mpres_t P, mpres_t Q) { - mp_size_t size_e; - unsigned long i; + mp_size_t size_e, i; ASSERT (mpz_cmp_ui (e, 1) >= 0); @@ -82,7 +81,7 @@ size_e = mpz_sizeinbase (e, 2); for (i = size_e - 1; i > 0;) { - if (mpz_tstbit (e, --i)) /* k -> 2k+1 */ + if (ecm_tstbit (e, --i)) /* k -> 2k+1 */ { if (i) /* Q is not needed for last iteration */ { @@ -310,9 +309,7 @@ if (mpz_sgn (B2min) < 0) mpz_set_d (B2min, B1); - mpmod_init (modulus, n, repr); - - { + { long P; const unsigned long lmax = 1UL<<28; /* An upper bound */ 
unsigned long lmax_NTT, lmax_noNTT; @@ -391,7 +388,7 @@ outputf (OUTPUT_VERBOSE, "Using lmax = %lu with%s NTT which takes " "about %luMB of memory\n", faststage2_params.l, s, MB); } - } + } /* Print B1, B2, polynomial and x0 */ print_B1_B2_poly (OUTPUT_NORMAL, ECM_PP1, B1, *B1done, B2min_parm, B2min, @@ -409,6 +406,23 @@ faststage2_params.m_1); } + if (test_verbose (OUTPUT_VERBOSE)) + { + if (mpz_sgn (B2min_parm) >= 0) + { + outputf (OUTPUT_VERBOSE, + "Can't compute success probabilities for B1 <> B2min\n"); + } + else + { + rhoinit (256, 10); + /* If x0 is chosen randomly, the resulting group order will behave, + on average, like for P-1, thus we use the same code as for P-1. */ + print_prob (B1, B2, 0, k, 1, go); + } + } + + mpmod_init (modulus, n, repr); mpres_init (a, modulus); mpres_set_z (a, p, modulus); diff -Nru gmp-ecm-7.0.4+ds/prime95.save gmp-ecm-7.0.5+ds/prime95.save --- gmp-ecm-7.0.4+ds/prime95.save 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/prime95.save 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ +[Tue Jan 05 20:39:57 2016] +N=0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF; QX=0x51CA3E785D8808AEAD7BF955E8B6D938BD1554C9B504BFC942CC7376809B4EBA7D16A0B240291B5EC0061D9300289DE283F4F4B33952AA477258D51D7A03F2; SIGMA=3969830600789499 +M503 completed 1 ECM curve, B1=100000, B2=100000, We4: 00270D4E diff -Nru gmp-ecm-7.0.4+ds/README gmp-ecm-7.0.5+ds/README --- gmp-ecm-7.0.4+ds/README 2016-06-13 11:59:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/README 2022-06-06 14:16:49.000000000 +0000 @@ -1,3 +1,4 @@ +[note: this file is outdated now that GMP-ECM moved to gitlab.] This is the README file for GMP-ECM. (See INSTALL-ecm for installing GMP-ECM and the ecm library, and README.lib for using the ecm library.) @@ -14,6 +15,7 @@ 9. GMP-ECM and GPU. 11. Record factors. 11. Known problems. +12. GMP-ECM packages. 
############################################################################## @@ -22,7 +24,8 @@ GMP-ECM reads the numbers to be factored from stdin (one number on each line) and requires a numerical parameter, the stage 1 bound B1. A reasonable stage 2 bound B2 for the given B1 is chosen by default, but can be overridden -by a second numerical parameter. By default, GMP-ECM uses the ECM factoring +by a second numerical parameter. (For a given B1 value, the "default" B2 might +differ between ECM, P-1, and P+1.) By default, GMP-ECM uses the ECM factoring algorithm. Example: To run one curve of ECM with B1=1000000 on each number in the file @@ -331,27 +334,24 @@ 2. Comments can be placed in the file. The C++ "one line comment" // is used. Everything after the // on a line (including the //) is ignored. Warning: no input number should appear on such a comment line. -3. Line continuation. If a line ends with a backslash character '\', - it is considered it continues on the next line (ignoring the '\'). -4. Any white space (space, tab, end of line) is ignored. However, the "end of - line" is used to end the expression (unless of course there is a '\' - character before the end of line). For example, processing this: +3. Any white space (space, tab, end of line) is ignored. However, the "end of + line" is used to end the expression. For example, processing this: 1 2 3 4 5 6 7 8 9 would be the same as processing 123456789 -5. "common" arithmetic expressions (* / + - %), the period '.' might be used +4. "common" arithmetic expressions (* / + - %), the period '.' might be used in place of * for multiply, and - can be unary minus (e.g., -55555551). Example: echo "3*5+2" | ./ecm 100 -6. Grouping ( [ { for start of group (which symbol is used does not matter) +5. Grouping ( [ { for start of group (which symbol is used does not matter) and ) ] } to end a group (again all 3 symbols mean the SAME thing). -7. 
Exponentiation with the ^ character (i.e., 2^24 is the same as 16777216). +6. Exponentiation with the ^ character (i.e., 2^24 is the same as 16777216). Example: echo "2^24+1" | ./ecm 100 -8. Simple factorial using the exclamation point ! character. Example is +7. Simple factorial using the exclamation point ! character. Example is 53! == 1*2*3*4...*52*53. Example: echo '53!+1' | ./ecm 1e2 -9. Multi-factorial as in: n!m with an example: 15!3 == 15.12.9.6.3. -10. Simple Primorial using the # character with example of 11# == 2*3*5*7*11 -11. Reduced Primorial n#m with example of 17#5 == 5.7.11.13.17 -12. Functions are possible with the expression parser. Currently, the only +8. Multi-factorial as in: n!m with an example: 15!3 == 15.12.9.6.3. +9. Simple Primorial using the # character with example of 11# == 2*3*5*7*11 +10. Reduced Primorial n#m with example of 17#5 == 5.7.11.13.17 +11. Functions are possible with the expression parser. Currently, the only available function is Phi(x,n), however other functions should be easy to add in the future. @@ -395,7 +395,7 @@ The -param option is used to choose the parametrization from which the curves are taken. It can take 4 values (s is the 'sigma' parameter): - 0: use Suyama parametrization. It was the parametrization used previous + 0: use Suyama parametrization. It was the parametrization used by previous versions of GMP-ECM. u = s^2-5, v = 4*s, A = (v-u)^3*(3*u+v)/(4*u^3*v)-2, @@ -414,6 +414,13 @@ Generally, the parameter s is chosen randomly but the -sigma option can be used to choose its value. +* with -param 0, there is no constraint on sigma +* with -param 1 (only for 64-bit processors), s^2 should fit in a 64-bit + integer, thus s < 2^32 +* with -param 2, there is no constraint on sigma +* with -param 3 (mostly used for GPU), s should fit in a 32-bit word, + thus s < 2^32 + In the case where only "-sigma s" is used, "-param 0" is assumed in order to be compatible with older version of GMP-ECM. 
The above is also true when resuming from a file (see Section 7). In the case where only "-param p" is used, the @@ -446,17 +453,31 @@ Applications include using curves with some exotic prescribed torsion subgroup (see d below), or CM (Complex Multiplication) curves. -c) with param = 6: curves in Hessian form ------------------------------------------ +c) with param = 6, 7: curves in (twisted) Hessian form +------------------------------------------------------ + +These curves have torsion group Z3xZ3 over Q(sqrt(-3)) and are useful when +the prime factor p we are looking for is known to satisfy p = 1 mod 3. +Since p = 1 mod 3 is quite frequent, a try at this version is not unthinkable +in general. + +Curves in projective Hessian form have equation + X^3+Y^3+Z^3=D*X*Y*Z. +They should be used with + + -param 6 -A -x0 -y0 -This is to use curves in Hessian form: X^3+Y^3+1=3*A*X*Y, A^3 <> 1 mod N. -These curves have torsion group Z3xZ3 over Q(sqrt(-3)) and are useful when the -prime factor p we are looking for is known to satisfy p = 1 mod 3. It should be -used with - -A -x0 -y0 +with D^3 mod N != 1, and (x0=X0/Z0, y0=Y0/Z0) is a base point on the curve. -where (x0, y0) is a base point on the curve. Since p = 1 mod 3 is quite -frequent, a try at this version is not unthinkable in general. +Curves in projective twisted Hessian form have equation + a*X^3+Y^3+Z^3=d*X*Y*Z. +They should be used with + + -param 7 -A -x0 -y0 + +where D=a^3/d and (x0=X0/Z0, y0=Y0/Z0) is a base point on the curve. + +Ref: JKL-ECM in proceedings ANTS-XII. d) -torsion: ------------ @@ -598,14 +619,17 @@ $ ecm -resume pm1chkpoint 1e10 1 -Note: if an existing file is specified as the checkpoint file, it will be +Note 1: if an existing file is specified as the checkpoint file, it will be silently overwritten! 
+ Note 2: When resuming a checkpoint file, additional small primes may be processed in stage 1 when the checkpoint file is resumed, so the end-of-stage 1 residues of an uninterrupted run and a checkpointed run may not match. The extra primes do not reduce the probability of finding factors, however. +Note 3: for ECM, the -chkpnt option is only implemented with -param 0 so far. + ############################################################################## 8. How to get the best of GMP-ECM? @@ -616,6 +640,10 @@ This will optimize parameters for your machine and put them in ecm-params.h. +If you do an out-of-source build, then: +* copy the new ecm-params.h file into the source folder +* run "make clean" and "make" again + The ecm program automatically selects what it thinks is the best arithmetic for the given input number. If that choice is not optimal, you may force the use of a certain arithmetic by trying options -modmulm, -mpzmod, @@ -655,7 +683,7 @@ 64 bit only work for x86 using gcc or Intel cc, so it is compiler dependent. Note on factoring Fermat numbers: -GMP-ECM features Schönhage-Strassen multiplication for polynomials in +GMP-ECM features Schönhage-Strassen multiplication for polynomials in stage 2 when factoring Fermat numbers (not in the new, fast stage 2 for P+1 and P-1. This is to be implemented.) This greatly reduces the number of modular multiplications required, thus improving speed. It does, however, @@ -665,7 +693,7 @@ For the number of blocks, choices of 2, 3 or 4 usually give best performance. However, if the polynomial degree becomes too large, relatively expensive Karatsuba or Toom-Coom methods are required to split the polynomial before -Schönhage-Strassen's method can handle them. That can make a larger number +Schönhage-Strassen's method can handle them. That can make a larger number of blocks worthwhile. When factoring the m-th Fermat number F_m = 2^(2^m)+1, degrees up to dF=2^(m+1) can be handled directly. 
If your B2 choice requires a degree much larger than this (dF is printed with the -v @@ -751,3 +779,14 @@ l=2^19:N<2^2910, l=2^20:N<2^1340, l=2^21:N<2^578, l=2^22:N<2^228. Since log(N)*l is approximately constant, this limits the amount of memory that can be used to about 600MB for P-1, and 1200MB for P+1. + +############################################################################## + +12. GMP-ECM packages. + +GMP-ECM is a standard Debian package. See for example +https://packages.debian.org/sid/gmp-ecm. + +An opam package was built by Michael Soegtrop and was tested on macOS +(including Apple silicon), many Linux variants and Windows (MinGW with cygwin +build host). See https://github.com/ocaml/opam-repository/pull/20105. diff -Nru gmp-ecm-7.0.4+ds/README.dev gmp-ecm-7.0.5+ds/README.dev --- gmp-ecm-7.0.4+ds/README.dev 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/README.dev 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,121 @@ +To use the autotools: + +$ libtoolize +$ autoheader +$ aclocal +$ automake -c -a +$ autoconf +$ ./configure --with-gmp=<DIR> +$ make +$ make check + +Alternatively, you can simply type: + +$ autoreconf -i +$ ./configure --with-gmp=<DIR> +$ make +$ make check + +Warning: only configure.ac and Makefile.am should be in cvs + (the other files are automatically generated) + +############################################################################## + +Documentation: + +- ecm.xml was generated from ecm.1 using doclifter-2.1 + ./doclifter < ecm.1 > ecm.xml + (http://en.tldp.org/HOWTO/DocBook-Demystification-HOWTO/x181.html + http://www.catb.org/~esr//doclifter/) + See the documentation of ROX-Filer (http://rox.sf.net/) for example + (see rox/ROX-Filer/src/Docs/Manual.xml). + +- to generate ecm.html: + $ xsltproc -o ecm.html $XSLDIR/html/docbook.xsl ecm.xml + where XSLDIR is the directory where docbook stylesheets are installed + (e.g.
/usr/share/sgml/docbook/xsl-stylesheets-1.65.1-1) + +- to generate man format: + $ xsltproc -o ecm.1 $XSLDIR/manpages/docbook.xsl ecm.xml + (or "xmlto man ecm.xml") + +- to generate pdf: + $ xmlto pdf ecm.xml + +Note: we currently keep ecm.1 in the repository, even though it is a +generated file and not a source. This is to avoid having a dependency +on xsltproc and docbook in every build; these tools are not usually +installed by default in distributions. It does not seem to be possible +to set up the Makefile so ecm.1 is required only for "make install" and +"make dist". We need to list ecm.1 in one of the automake _MANS primaries +to get it installed in the proper man page directory, and when it is +listed in any _MANS primary, it is automatically added to the "am-all" +target which is processed by "make" or "make all". + +############################################################################## + +To make a new release: + + 0) Check all tests pass (test.ecm, test.pm1, test.pp1, + testlong.ecm, testlong.pm1, testlong.pp1), + with or without NTT (./test.ecm "./ecm -no-ntt"), + with or without --enable-openmp, + check the issues in TODO, and for issues on gitlab.inria.fr/zimmerma/ecm. + Check no compiler warnings remain (with and without assertions), + if possible with different compilers (gcc, clang, llvm-gcc, ...) + Check "make longcheck" works with --enable-valgrind-tests. + Check that configure works outside the source directory. + Check the default tuning files */params.h (and */*/params.h, see the main + file ecm-params.h) on different platforms with "make ecm-params" + Compare the efficiency wrt the previous release with "./ecm 1e6 < c270" + and "./ecmbench ./ecm". + 1) Check the version (configure.ac and build.vc*/gen_ecm_h.bat). + Switch assertions to off by default in configure.ac. + 2) Update the NEWS file, and check the INSTALL-ecm file is up-to-date. + 3) Update the ChangeLog file, e.g. 
git log > /tmp/ChangeLog and edit + ChangeLog to insert /tmp/ChangeLog at the beginning and remove + the duplicated lines. + 4) Check the required sizes of P-1, P+1, ECM champions in champions.h. + 5) Generate the release version with "make dist". + 6) Test the release version with "make distcheck" (in particular, check + the man page is correctly installed). You might need to redefine + LD_LIBRARY_PATH so that the system libgmp is used. + 7) Check the man page (ecm.1) is up to date. + Update version-info in Makefile.am if needed. + 8) Check factors are really found using the checkprob and check.sage + programs, for example to estimate the average number of curves to find + a 20-digit prime with B1=11000 and -param 1: + $ ./checkprob "./ecm -param 1" 31622776601683800097 11000 + $ ./checkprob "./ecm -param 0" 3162277660168380112437259 50000 # 25d + and with check.sage, the following command checks the ./ecm binary + for all parametrizations (-param 0-3), with B1=11000 and B2=1873422, + and all sigma values < 1000, and checks the prime 31622776601683800097 + is found when it should be (i.e., when the group order is (B1,B2)-smooth): + check_found_all ("./ecm", 31622776601683800097, 11000, 1873422, 1000) + 9) If there is no problem, add a tag to the version corresponding to the + release, for example: + $ git tag -a 7.0.4 fcc8509 + $ git push + + Version svn tag git tag + 6.0.1 r781 1a2f3c6 + 6.2 r1290 1fb9bfa + 6.2.1 r1307 b600e50 + 6.3 r1501 b3451db + 6.4 r1672 468d182 + 6.4.1 r1844 1a2d08a + 6.4.2 r1882 69fd89d + 6.4.3 r2100 f8c244d + 6.4.4 r2439 2f06a62 + 7.0 r2900 2ffe6f9 + 7.0.1 r2932 cd57d0b + 7.0.2 r2961 4dcc4fc + 7.0.3 r2963 ee145f2 + 7.0.4 r2991 fcc8509 + + 10) increase the version number in configure.ac and build.vc*/gen_ecm_h.bat, + and switch assertions to on by default in configure.ac. 
+ +To see the differences between a given tag and the current version: + +$ git diff 7.0.4 diff -Nru gmp-ecm-7.0.4+ds/README.dev.asm gmp-ecm-7.0.5+ds/README.dev.asm --- gmp-ecm-7.0.4+ds/README.dev.asm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/README.dev.asm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,27 @@ +Architecture-specific assembly code is stored in different subdirectories. +Currently (March 2013), these are + athlon + pentium4 + powerpc64 + x86_64 + +The code for pentium4 uses MMX/SSE2 instructions, and therefore can not +run on old x86. The code in the 'athlon' subdir is pure i486 and can +therefore be used for any x86 but the asm is optimized for athlon. + +In these subdirs, there is size-specific asm code for combined +multiplication and redc. The sizes are currently 1 to 20 limbs. If +needed, one could go to higher sizes, but is there a need? There is also +a redc function coded in asm (without mul). + +The files are automatically generated using a Python script. This +generation is not done at configure or compile time, to avoid a +dependency on Python. However, the script is given, for developers to +play with. + +At configure, if asm-redc is enabled, symbolic links are created to the +.asm files in the appropriate directory. + +Note (added March 11, 2013): on most architectures those asm files are not used +any more, since using mpn_mul_n or mpn_sqr and mpn_redc{1,2,n} is faster (see +the bench_mulredc program and the params.h files). diff -Nru gmp-ecm-7.0.4+ds/README.gpu gmp-ecm-7.0.5+ds/README.gpu --- gmp-ecm-7.0.4+ds/README.gpu 2016-08-23 09:22:08.000000000 +0000 +++ gmp-ecm-7.0.5+ds/README.gpu 2022-06-06 14:16:49.000000000 +0000 @@ -1,13 +1,14 @@ This is the README file for GPU version of GMP-ECM. The GPU code will only work with NVIDIA GPU of compute capability greater -than 2.0. +than 3.0. Table of contents of this file 1. How to enable GPU code in GMP-ECM -2. Basic Usage -3. Advanced Usage -4. Known issues +2.
How to enable CGBN code in GMP-ECM +3. Basic Usage +4. Advanced Usage +5. Known issues ############################################################################## @@ -20,15 +21,19 @@ $ ./configure --enable-gpu [other options] This will configure the code for NVIDIA GPU for all compute capabilities -between 2.0 and 5.3 known to the nvcc compiler. +between 3.0 and 7.0 known to the nvcc compiler. + +To enable only a single compute capability you can set '--enable-gpu=XX' + + $ ./configure --enable-gpu=61 [other options] By default, GMP-ECM will look for cuda.h in the default header directories, -but you can specify an other directory, such as /opt/cuda, with: +but you can specify another directory, such as /opt/cuda, with: $ ./configure --enable-gpu --with-cuda=/opt/cuda By default, GMP-ECM will look for the nvcc compiler in $PATH, but you can -specify an other directory: +specify another directory: $ ./configure --enable-gpu --with-cuda-bin=/PATH/DIR @@ -37,19 +42,24 @@ $ ./configure --enable-gpu --with-cuda-include=/PATH/DIR By default, GMP-ECM will look for CUDA the default library directories, but you -can specify an other directory: +can specify another directory: $ ./configure --enable-gpu --with-cuda-lib=/PATH/DIR Some versions of CUDA are not compatible with recent versions of gcc. To specify which C compiler is called by the CUDA compiler nvcc, type: - $ ./configure --enable-gpu --with-cuda-compiler=/PATH/DIR - + $ ./configure --enable-gpu --with-cuda-compiler=/PATH/DIR + The value of this parameter is directly passed to nvcc via the option "--compiler-bindir". By default, GMP-ECM lets nvcc choose what C compiler it uses. 
+If you get errors about "cuda.h: present but cannot be compiled", +try setting CC to a known good gcc; you may also need to use --with-cuda-compiler + + $ ./configure --enable-gpu CC=gcc-8 + Then, to compile the code, type: $ make @@ -58,9 +68,27 @@ $ make check +Additional randomized checks can be run with + + $ sage check_gpuecm.sage ./ecm + +For failing kernels some additional information may be present in cuda-memcheck + + $ echo "(2^997-1)" | cuda-memcheck ./ecm -cgbn -gpucurves 4096 -v 16000 0 + ############################################################################## -2. Basic Usage +2. How to enable CGBN code in GMP-ECM + +By default the CGBN code is not enabled during GPU builds, follow the +instructions above but add the "--with-cgbn-include" argument to point at +the CGBN include directory (.../CGBN/include/cgbn). + + $ ./configure --enable-gpu --with-cgbn-include=/PATH/DIR/CGBN/include/cgbn + +############################################################################## + +3. Basic Usage To use your GPU for step 1, just add the -gpu option: @@ -69,7 +97,13 @@ It will compute step 1 on the GPU, and then perform step 2 on the CPU (not in parallel). -The only parametrization compatible with GPU code is "-param 3". +If you configured CGBN you can additionally pass the -cgbn option: + + $ echo "(2^835+1)/33" | ./ecm -gpu -cgbn 1e4 + +This is generally faster than the default CUDA code. + +The only parametrization compatible with GPU code is "-param 3". You can save the end of step 1 with "-save" and then load the file to execute step 2. But you cannot resume to continue step 1 with a bigger B1. @@ -79,7 +113,7 @@ ############################################################################## -3. Advanced Usage +4. Advanced Usage The option "-gpudevice n" forces the GPU code to be executed on device n. Nvidia tool "nvidia-smi" can be used to know to which number is associated a GPU.
@@ -99,11 +133,12 @@ ############################################################################## -4. Known issues +5. Known issues -On some configurations (GTX 570 with compute capability 2.0 for example) -one gets the Cuda error "too many resources requested for launch". This -can be solved by decreasing ECM_GPU_CURVES_BY_BLOCK from 32 to 16 in ecm-gpu.h. +If you get "Error msg: forward compatibility was attempted on non supported HW" +or "error: 'cuda.h' and 'cudart' library have different versions", then you +can look at https://stackoverflow.com/questions/43022843/nvidia-nvml-driver-library-version-mismatch/45319156#45319156. +In general the best solution is to restart the machine. ############################################################################## diff -Nru gmp-ecm-7.0.4+ds/resume.c gmp-ecm-7.0.5+ds/resume.c --- gmp-ecm-7.0.4+ds/resume.c 2016-06-16 08:24:39.000000000 +0000 +++ gmp-ecm-7.0.5+ds/resume.c 2022-06-06 14:16:49.000000000 +0000 @@ -369,8 +369,8 @@ mpz_mul_ui (checksum, checksum, (*param+1)%CHKSUMMOD); if (mpz_fdiv_ui (checksum, CHKSUMMOD) != saved_checksum) { - fprintf (stderr, "Resume file line has bad checksum %u, expected %lu\n", - saved_checksum, mpz_fdiv_ui (checksum, CHKSUMMOD)); + fprintf (stderr, "Resume file line has bad checksum %u, expected %u\n", + saved_checksum, (unsigned int) mpz_fdiv_ui (checksum, CHKSUMMOD)); mpz_clear (checksum); continue; } @@ -440,10 +440,10 @@ else fprintf (file, "; ETYPE=%d; A=", Etype); - mpz_out_str (file, 10, sigma); - mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (sigma, CHKSUMMOD)); - if (param != ECM_PARAM_DEFAULT) - mpz_mul_ui (checksum, checksum, (param+1)%CHKSUMMOD); + mpz_out_str (file, 10, sigma); + mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (sigma, CHKSUMMOD)); + if (param != ECM_PARAM_DEFAULT) + mpz_mul_ui (checksum, checksum, (param+1)%CHKSUMMOD); } fprintf (file, "; B1=%.0f; N=", B1); @@ -455,8 +455,8 @@ mpz_out_str (file, 16, x); mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (n->n, 
CHKSUMMOD)); mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (x, CHKSUMMOD)); - fprintf (file, "; CHECKSUM=%lu; PROGRAM=GMP-ECM %s;", - mpz_fdiv_ui (checksum, CHKSUMMOD), VERSION); + fprintf (file, "; CHECKSUM=%u; PROGRAM=GMP-ECM %s;", + (unsigned int) mpz_fdiv_ui (checksum, CHKSUMMOD), VERSION); mpz_clear (checksum); if (y != NULL) { @@ -532,11 +532,7 @@ } /* Call write_resumefile_line for each residue in x. - x = x0 + x1*N + ... + xk*N^k, xi are the residues (this is a hack for GPU) - FIXME : x0 corresponds to sigma + gpu_curves-1 - xk corresponds to sigma - should be the other way around - + x = x0 + x1*2^(bits) + ... + xk*2^(bits*k), xi are the residues (this is a hack for GPU) Returns 1 on success, 0 on error */ int write_resumefile (char *fn, int method, mpz_t N, ecm_params params, @@ -621,19 +617,22 @@ comment); } } - else + else /* gpu case */ { - mpz_add_ui (params->sigma, params->sigma, params->gpu_number_of_curves); + size_t n_bits = mpz_sizeinbase(N, 2); for (i = 0; i < params->gpu_number_of_curves; i++) { - mpz_sub_ui (params->sigma, params->sigma, 1); - mpz_fdiv_qr (params->x, tmp_x, params->x, N); + mpz_fdiv_r_2exp (tmp_x, params->x, n_bits); + mpz_fdiv_q_2exp (params->x, params->x, n_bits); mpz_mod (tmp_x, tmp_x, n->n); write_resumefile_line (file, method, params->B1done, params->sigma, params->sigma_is_A, params->E->type, - params->param, + /* since the gpu version always uses -param 3, + we hardcode it in the save file */ + ECM_PARAM_BATCH_32BITS_D, tmp_x, NULL, n, orig_x0, orig_y0, comment); + mpz_add_ui (params->sigma, params->sigma, 1); } } diff -Nru gmp-ecm-7.0.4+ds/rho.c gmp-ecm-7.0.5+ds/rho.c --- gmp-ecm-7.0.4+ds/rho.c 2015-02-11 08:34:48.000000000 +0000 +++ gmp-ecm-7.0.5+ds/rho.c 2022-06-06 14:16:49.000000000 +0000 @@ -34,7 +34,7 @@ #include #if defined(TESTDRIVE) #include -#include "primegen.h" +#include #endif #if defined(TESTDRIVE) #include @@ -171,13 +171,13 @@ } -/* The number of positive integers up to x that have no prime factor up to y, 
+/* The number of positive integers up to x that have no prime factor <= y, for x >= y >= 2. Uses Buchstab's identity */ unsigned long Buchstab_Phi(unsigned long x, unsigned long y) { unsigned long p, s; - primegen pg[1]; + primesieve_iterator pg[1]; if (x < 1) return 0; @@ -189,9 +189,9 @@ #endif s = 1; - primegen_init (pg); - primegen_skipto (pg, y + 1); - for (p = primegen_next(pg); p <= x; p = primegen_next(pg)) + primesieve_init (pg); + primesieve_skipto (pg, y, x+1); + for (p = primesieve_next_prime (pg); p <= x; p = primesieve_next_prime (pg)) s += Buchstab_Phi(x / p, p - 1); return (s); } @@ -203,7 +203,7 @@ Buchstab_Psi(const unsigned long x, const unsigned long y) { unsigned long r, p; - primegen pg[1]; + primesieve_iterator pg[1]; if (x <= y) return (x); @@ -234,8 +234,8 @@ } r = 1; - primegen_init (pg); - for (p = primegen_next(pg); p <= y; p = primegen_next(pg)) + primesieve_init (pg); + for (p = primesieve_next_prime (pg); p <= y; p = primesieve_next_prime (pg)) r += Buchstab_Psi (x / p, p); return (r); } @@ -443,6 +443,11 @@ } #endif +/* return the value of the "local" Dickman rho function, for numbers near x + (as opposed to numbers <= x for the original Dickman rho function). + Reference: PhD thesis of Alexander Kruppa, + http://docnum.univ-lorraine.fr/public/SCD_T_2010_0054_KRUPPA.pdf, + equation (5.6) page 100 */ static double dickmanlocal (double alpha, double x) { @@ -493,22 +498,27 @@ return ((primemap[n / 30] & (1 << r)) != 0); } +/* return the sum in Equation (5.10) page 102 of Alexander Kruppa's + PhD thesis */ static double dickmanmu_sum (const unsigned long B1, const unsigned long B2, const double x) { double s = 0.; - const double logB1 = 1. / log(B1); + const double inv_logB1 = 1. 
/ log(B1); const double logx = log(x); unsigned long p; for (p = B1 + 1; p <= B2; p++) if (isprime(p)) - s += dickmanlocal ((logx - log(p)) * logB1, x / p) / p; + s += dickmanlocal ((logx - log(p)) * inv_logB1, x / p) / p; - return (s); + return s; } +/* return the probability that a number < x has its 2nd largest prime factor + less than x^(1/alpha) and its largest prime factor less than x^(beta/alpha) +*/ static double dickmanmu (double alpha, double beta, double x) { @@ -645,6 +655,10 @@ return prob (B1, B2, N, nr, S, ECM_EXTRA_SMOOTHNESS); } +/* see Willemien Ekkelkamp's Phd thesis: + https://openaccess.leidenuniv.nl/bitstream/handle/1887/14567/proefschrift_041109.pdf?sequence=2: + Bach-Peralta formula page 12 + Corollary 4 page 18 with a 2nd-order term */ double pm1prob (double B1, double B2, double N, double nr, int S, const mpz_t go) { @@ -654,11 +668,16 @@ account by the "smoothness" value here; a prime p-1 is about as likely smooth as a random number around (p-1)/exp(smoothness). smoothness = \sum_{q in Primes} log(q)/(q-1)^2 */ + /* Note that this routine is also called for P+1, where we assume the same + behaviour as with P-1. However, if x0=6/5, Kruppa writes in his PhD + thesis that we get smoothness = 1.92012; with x0=2/7, we get + smoothness = 2.05093. 
*/ double smoothness = 1.2269688; unsigned long i; if (go != NULL && mpz_cmp_ui (go, 1UL) > 0) { + double res; mpz_init (cof); mpz_set (cof, go); for (i = 2; i < 100; i++) @@ -673,8 +692,9 @@ } /* printf ("pm1prob: smoothness after dividing out go primes < 100: %f\n", smoothness); */ - return prob (B1, B2, N, nr, S, smoothness + log(mpz_get_d (cof))); + res = prob (B1, B2, N, nr, S, smoothness + log(mpz_get_d (cof))); mpz_clear (cof); + return res; } return prob (B1, B2, N, nr, S, smoothness); @@ -842,11 +862,11 @@ double B1, B2, N, nr, r, m; int S; unsigned long p, i, pi; - primegen pg[1]; + primesieve_iterator pg[1]; - primegen_init (pg); + primesieve_init (pg); i = pi = 0; - for (p = primegen_next (pg); p <= PRIME_PI_MAX; p = primegen_next (pg)) + for (p = primesieve_next_prime (pg); p <= PRIME_PI_MAX; p = primesieve_next_prime (pg)) { for ( ; i < p; i++) prime_pi[PRIME_PI_MAP(i)] = pi; @@ -858,7 +878,17 @@ if (argc < 2) { - printf ("Usage: rho [ ]\n"); + printf ("Usage: rho [ ]\n\n\n"); + printf (" Calculate the probability of ECM/P-1 finding a factor near N\n" + " with B1/B2, evaluating nr random distinct points in stage 2,\n" + " with a degree -S Dickson polynomial (if S < 0) or\n" + " S'th power as the Brent-Suyama function\n\n"); + printf (" B1 limit.\n"); + printf (" B2 limit.\n"); + printf (" N of similiar size, or number of bits in factor (if < 50).\n"); + printf (" Number of random points evaluated in stage 2.\n"); + printf (" Degree of Brent-Suyama polynomial in stage 2.\n"); + printf (" [ ] Limit P-1 to primes p == r (mod m).\n"); return 1; } diff -Nru gmp-ecm-7.0.4+ds/stage2.c gmp-ecm-7.0.5+ds/stage2.c --- gmp-ecm-7.0.4+ds/stage2.c 2016-02-25 08:04:36.000000000 +0000 +++ gmp-ecm-7.0.5+ds/stage2.c 2022-06-06 14:16:49.000000000 +0000 @@ -309,7 +309,7 @@ } /* Input: X is the point at end of stage 1 - n is the number to factor + modulus contains the number to factor B2min-B2 is the stage 2 range (we consider B2min is done) k0 is the number of blocks (if 
0, use default) S is the exponent for Brent-Suyama's extension @@ -325,8 +325,8 @@ */ int stage2 (mpz_t f, void *X, mpmod_t modulus, unsigned long dF, unsigned long k, - root_params_t *root_params, int use_ntt, char *TreeFilename, - int (*stop_asap)(void)) + root_params_t *root_params, int use_ntt, char *TreeFilename, + unsigned int curve_number, int (*stop_asap)(void)) { unsigned long i, sizeT; mpz_t n; @@ -771,6 +771,9 @@ if (stop_asap == NULL || !(*stop_asap)()) { st0 = elltime (st0, cputime ()); + if (curve_number > 0) { + outputf (OUTPUT_NORMAL, "Curve %d ", curve_number); + } outputf (OUTPUT_NORMAL, "Step 2 took %ldms\n", st0); } diff -Nru gmp-ecm-7.0.4+ds/techdocs/buildpoly.tex gmp-ecm-7.0.5+ds/techdocs/buildpoly.tex --- gmp-ecm-7.0.4+ds/techdocs/buildpoly.tex 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/techdocs/buildpoly.tex 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,188 @@ +\documentclass{article} +\usepackage{amsmath,amssymb} +\begin{document} +\newcommand{\Z}{\mathbb{Z}} +\newcommand{\Zn}[1]{(\Z/{#1}\Z)^{*}} + +\title{Building polynomials with roots $r^k$, $k\perp n$} +\author{A. Kruppa} +\maketitle + + +\begin{abstract} +This note presents some of the basic ideas underlying the scheme for building +a reciprocal Laurent polynomial from its roots as described by Montgomery +\cite{Montgomery2007}. +\end{abstract} + + +\section{The set of integers coprime to $n$} +Let $\Zn{n} = \{k + n\Z, k \perp n\}$ +\footnote{The notation $a\perp b$ for ``$a$ is relatively prime to $b$'' + follows the suggestion in \cite[4.5]{Graham_Knuth_Patashnik}} +be the set of residue classes coprime to $n$. +Let $A + B$ denote the set of sums, $A + B = \{a+b, a\in A, b\in B\}$. + +Then we have, for $l \perp m$, +\begin{equation}\label{Zn_sum} +\Zn{lm} = l\Zn{m} + m\Zn{l}. +\end{equation} +and +\begin{equation} +\Zn{p^k} = \Zn{p} + \sum_{i=1}^{k-1} p^i (\Z/p\Z). +\end{equation} + +By definition of the Euler totient function $\varphi$, $|\Zn{n}| = \varphi(n)$.
+ +\subsection{Positive representatives} +Let $\bar{R}_n = \{1 \leq k < n, k \perp n\}$ be the set of smallest +positive integers coprime to $n$ and less than $n$, i.e. the smallest +positive representatives of $\Zn{n}$. +Unfortunately, (\ref{Zn_sum}) does not immediately carry over to +$\bar{R}_{lm}$, as $l\bar{R}_m + m\bar{R}_l$ contains integers $\geq lm$. +Since $n-1 \in \bar{R}_n$, we get +$l(m-1) + m(l-1) = 2lm-l-m \geq lm$ for $l,m > 1$. For example, +$\bar{R}_2 = \{1\}$, $\bar{R}_3 = \{1,2\}$, and +$3\bar{R}_2 + 2\bar{R}_3 = \{5, 7\} \neq \{1, 5\} = \bar{R}_6$. + +% Not really important +% With $1, n-1 \in \bar{R}_n$, we have +% $a \in l\bar{R}_m + m\bar{R}_l \Rightarrow l+m \leq a \leq 2lm - l - m$. +% Hence, $l\bar{R}_m + m\bar{R}_l$ lies on an interval of length +% $2(lm - l - m)$. + +\subsection{Representatives symmetric around $0$} +Let $\hat{R}_n = \{|k| \leq (n-1)/2, k \perp n\}$ be the +set of integers of smallest absolute value that are coprime to $n$, +for $n > 2$. As for $\bar{R}_n$, (\ref{Zn_sum}) does not carry over, +as for example $3\hat{S}_2 + 2\hat{S}_3 = \{-1,7\} \neq \{-1,1\}$. + +% For odd $l,m,n$, since $-\frac{n-1}{2},\frac{n-1}{2} \in \hat{S}_n$, +% $a \in l\hat{S}_m + m\hat{S}_l \Rightarrow |a| \leq lm-\frac{l+m}{2}$, +% covering an interval of length $2lm - l - m$. + +For $p$ prime, $p \equiv 3 \pmod{4}$, $\hat{R}_p$ can be factored by +$r=\frac{p+1}{4}$, $\hat{R}_p = \{ -r, r\} + \{-r+1, \ldots, r-1\}$. +The elements of the second set form an arithmetic progression and +so it can be factored again if its cardinality is composite. + +\pagebreak[1] + +Montgomery suggests sets with elements symmetric around $0$ and of +even difference, i.e. $\tilde{R}_n = \{2i-n \perp n, 1\leq i \leq n-1 \}$. +The advantage is that with $p$ prime, the elements of $\tilde{R}_p$ always +form an arithmetic progression which can be factored as a set of sums, if +the cardinality of the set is composite. +% by $\tilde{S}_{lm} = m\tilde{S}_l + \tilde{S}_m$. 
+The factors are again arithmetic progressions, so $\tilde{R}_p$ +can always be factored into sets of prime cardinality. +The disadvantage is that $\tilde{R}_p$ covers an interval of length +$2p-4$, about twice as large as $p-2$ for $\bar{R}_p$ or $p-1$ for +$\hat{R}_p$. + +\subsection{Combining several sets of sums} + +For composite, squarefree $n$, we can write +\begin{equation}\label{R_longsum} +R_n \equiv \sum_{p\mid n} \frac{n}{p}R_p \pmod{n} +\end{equation} +where $R_n$ represents any of $\bar{R}_n$, $\hat{R}_n$, +$\tilde{R}_n$ or similar choice of particular representatives of +the residue classes of $\Zn{n}$. +If the elements of $R_p$ are bounded below by $\alpha p$ and above by +$\beta p$, we clearly have lower and upper bounds of the elements of the RHS +of (\ref{R_longsum}) of $\alpha n\nu(n)$ and $\beta n \nu(n)$, respectively, +where $\nu(n)$ is the number of prime divisors in $n$. +This bound grows only linearly in $\alpha$ and $\beta$, so choosing, e.g., +$\tilde{R}_n$ over $\hat{R}_n$ at most doubles the length of the interval +covered by the elements of the set of sums. + +\section{A polynomial with roots $r^k$, $k\perp n$} + +Let +\begin{displaymath} +F_{n,r}(x) = \prod_{k \in S_n} (x-r^k).
+\end{displaymath} + +Then, with $p \perp n$, +\begin{eqnarray*} +F_{pn,r}(x) & = & \prod_{k \in S_{pn}} \left(x-r^k\right) \\ + & = & \prod_{k \in pS_n + nS_p} \left(x-r^k\right) \\ + & = & \prod_{i \in nS_p} \prod_{j \in pS_n} \left(x-r^{i+j}\right) \\ + & = & \prod_{i \in nS_p} \prod_{j \in S_n} \left(x-r^{i+pj}\right) \\ + & = & \prod_{i \in nS_p} \prod_{j \in S_n} \left(r^i \left(\frac{x}{r^i}-r^{pj}\right)\right) \\ + & = & \prod_{i \in nS_p} r^{i\varphi(n)} \prod_{j \in S_n} \left(\frac{x}{r^i}-r^{pj}\right) \\ + & = & \prod_{i \in nS_p} r^{i\varphi(n)} F_{n,r^p}\left(\frac{x}{r^i}\right) +\end{eqnarray*} + +If $F_{n,r}(x) = \sum_{k=0}^{\varphi(n)} f_k x^k$, then +\begin{eqnarray*} +r^{i\varphi(n)} F_{n,r}\left(\frac{x}{r^i}\right) & = &\\ +\sum_{k=0}^{\varphi(n)} r^{i\varphi(n)} f_k x^k r^{-ik} & = & \\ +\sum_{k=0}^{\varphi(n)} f_k x^k r^{i(\varphi(n)-k)} && +\end{eqnarray*} +so we can generate $r^{i\varphi(n)} F_{n,r}\left(\frac{x}{r^i}\right)$ from +$F_{n,r}(x)$ by multiplying the coefficients from highest to lowest by powers +of $r$ in an increasing geometric progression, using $2$ multiplications +per coefficient. This means that to produce $F_{pn,r}(x)$, we need to compute +$F_{n,r^p}(x)$ only once. + +\pagebreak[4] + +\section {Converting a polynomial from base $U_i(Y)$ to $V_i(Y)$} + +In section 8.1, Montgomery proposes building a polynomial +$H(Y)=\sum_{j=1}^{n} h_j U_j(Y)$, +$Y=X+1/X$, in the $U_j(Y) = (X^j - 1/X^j)/(X - 1/X)$ basis and converting it +to $\hat{H}(Y)=\hat{h}_0 + \sum_{j=1}^{n} \hat{h}_j V_j(Y)$ +in the $V_j(Y) = X^j + 1/X^j$ +basis in a separate step, using the identity +$U_j(Y) = V_{j-1}(Y) + U_{j-2}(Y)$, $j\geq 2$, as well as $U_0(Y) = 0$, +$U_1(Y) = 1$. 
+Hence we have +\begin{eqnarray*} + H(Y) & = & \sum_{j=1}^{n} h_j U_j(Y) \\ + & = & h_1 + \sum_{j=2}^{n} h_j U_j(Y) \\ + & = & h_1 + \sum_{j=2}^{n} h_j (V_{j-1}(Y) + U_{j-2}(Y)) \\ +% & = & h_1 + \sum_{j=1}^{n-1} h_{j+1} V_{j}(Y) + \sum_{j=0}^{n-2} h_{j+2} U_{j}(Y) \\ +\end{eqnarray*} +so we can initialise $\hat{H}(Y) = 0$ and compute +\begin{eqnarray*} +\hat{H}(Y) & := & \hat{H}(Y) + h_i V_{i-1}(Y) \\ +H(Y) & := & H(Y) + h_i U_{i-2}(Y) \\ +H(Y) & := & H(Y) - h_i U_i(Y), +\end{eqnarray*} +which leaves $H(Y) + \hat{H}(Y)$ invariant, for $i = n, \ldots, 2$. Then we +have $H(Y) = h_1 U_1(Y) = h_1$ left and can set +\begin{eqnarray*} +\hat{h}_0 & := & \hat{h}_0 + h_1 \\ +h_1 & := & 0 +\end{eqnarray*} +which again leaves $H(Y) + \hat{H}(Y)$ invariant. We now have $H(Y) = 0$, so +$\hat{H}(Y)$ is equal to the original $H(Y)$, but is represented in standard +basis. + +Since both $h_i$ and $\hat{h}_i$ are accessed in descending order, they can +overlap. We can have $h_i$ and $\hat{h}_{i-1}$ in the same memory, so the +update simplifies from +\begin{eqnarray*} + \hat{h}_{i-1} & := & h_{i} \\ + h_{i-2} & := & h_{i-2} + h_i \\ + h_i & := & h_i - h_i +\end{eqnarray*} +to +\begin{displaymath} + h_{i-2} := h_{i-2} + h_i, +\end{displaymath} +for $i = n, \ldots, 2$, and the assignment $\hat{h}_0 := h_1$ becomes a no-op. + +\begin{thebibliography}{} +\bibitem{Montgomery2007} Peter-Lawrence Montgomery: +{\it Record Factorizations (I Hope!) Using P-1/FFT. Draft February 8, 2007}. +Unpublished manuscript. +\bibitem{Graham_Knuth_Patashnik}Graham, Knuth, and Patashnik: +{\it Concrete Mathematics}. Second edition. 1989.
Addison Wesley +\end{thebibliography} + +\end{document} + diff -Nru gmp-ecm-7.0.4+ds/techdocs/convolv.tex gmp-ecm-7.0.5+ds/techdocs/convolv.tex --- gmp-ecm-7.0.4+ds/techdocs/convolv.tex 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/techdocs/convolv.tex 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,152 @@ +\documentclass{article} +\usepackage{amsmath} +\begin{document} + +\title{Polynomial multipoint evaluation along a geometric progression} +\author{A. Kruppa} +\maketitle + +Let +\begin{displaymath} +F(x) = \sum_{j=0}^d f_j x^j +\end{displaymath} + +To evaluate $F$ on $n$ points in geometric progression, i.e. +$F(1), F(q), F(q^2), \ldots ,F(q^{n-1})$, we can do a convolution of the +polynomials +\begin{eqnarray*} +B(x) & = & \sum_{j=0}^{l - 1} b_j x^j \\ + & & \textrm{with } b_j = q^{b(j)}\\ +C(x) & = & \sum_{j=0}^d c_j x^j \\ + & & \textrm{with } c_j = f_j q^{c(j)} +\end{eqnarray*} +where $l = n + d$ and get the product +\begin{displaymath} +P(x) = B(x)C(x) = \sum_{i=0}^{l+d-1} p_i x^i +\end{displaymath} +with +\begin{eqnarray*} +p_i & = & \sum_{\substack{0\leq j\leq d,\\0\leq i-j < l}} b_{i-j} c_{j} \\ + & = & \sum_{\substack{0\leq j\leq d,\\0\leq i-j < l}} f_j q^{b(i-j)} q^{c(j)}. +\end{eqnarray*} +If $d \leq i < l$ and $0\leq j\leq d$, $0 \leq i-j < l$, +so the condition on $j$ suffices for this range of $i$: +\begin{eqnarray*} +p_i & = & \sum_{0\leq j\leq d} f_j q^{b(i-j) + c(j)}, \textrm{ for } d \leq i < l. +\end{eqnarray*} + +Since we only want the coefficients $p_i$ for $d\leq i < l$, it is +permissible to compute $B(x)C(x) \% (x^l-1)$, i.e. by a length $l$ cyclic +convolution product. This way, the coefficients +$p_i$, $l \leq i < l + d$ will overlap with the coefficients $p_i$, +$0 \leq i < d$, but we aren't interested in any of them. + + +\subsection{Simple progression} +Now we would like +\begin{displaymath} +q^{h(i)} p_{i+d} = \sum_{j=0}^{d} f_j q^{ji} +\end{displaymath} +so that $q^{h(i)} p_{i+d} = F(q^i)$, for $0 \leq i < n$. 
+% +Hence we can equate the exponents +\begin{displaymath} + b(i+d-j) + c(j) + h(i) = ij. +\end{displaymath} +If $b(x)$ is a polynomial, it must be of degree at least $2$ to produce the +$ij$ term. Let $b(x)=-(x-d)^2/2$. Then +\begin{eqnarray*} + -i^2/2 + ji - j^2/2 + c(j) + h(i) & = & ij \\ + -i^2/2 - j^2/2 + c(j) + h(i) & = & 0 +\end{eqnarray*} +so with $c(x) = h(x) = x^2/2$, the equality is satisfied. + +The values of $b(x)=-(x-d)^2/2$ are symmetric around $x=d$, so only $d+1$ of +the $b_j = q^{b(j)}, 0 \leq j \leq d+n$ need to be computed. For example, if +$n = d$, we +can start at $j=d$ and work up to $j=2d$. Computing $q^{b(j)}$ for +successive $j$ is done by $b(j+1) - b(j) = -j -1/2 + d$, +so $b_{j + 1} = b_j \cdot q^{-j -1/2 + d}$ which for $d \leq j \leq 2d$ +requires the sequence $b'_{j'} = q^{-j' -1/2}$ for $0 \leq j' \leq d$. + +The values $q^{c(j)}, 0 \leq j \leq d,$ for the $c_j$ sequence can be computed +for successive values of $j$ as well, but it is profitable to do it in reverse +order, starting at $j = d$. Then we have +$q^{c(j - 1)} = q^{c(j)} \cdot q^{-j + 1/2}$, and the $q^{-j + 1/2}$ values +are identical to those computed for the $b'_{j'}$ sequence when $j = j' + 1$, +so we can reuse all the $b'_{j'}$ except for $j' = d$ and only need to compute +$q^{1/2}$ afresh. This way, computing all $c_j$ for $0 \leq j \leq d$ +costs $d - 1 + o(d)$ multiplications for the $q^{c(j)}$ values, plus $d$ for +the multiplication with $f_i$, for a total of $2d + o(d)$. + +For the P-1 factoring algorithm, the $q^{h(i)}$ values need not be computed +as $q \perp N$ ($\perp$ meaning coprime) and so +$q^{h(i)} \perp N$, and $\gcd(F(q^i) q^{h(i)}, N) = \gcd(F(q^i), N)$. +Hence the total cost for the multipoint evaluation is $4d + o(d)$ +multiplications and a convolution of length $l$.
+ + +\subsection{More general progression} +To evaluate $F(q^{\alpha}), F(q^{\alpha+\beta}), ..., +F(q^{\alpha+(n-1)\beta})$, we would like instead +\begin{displaymath} +q^{h(i)} g_{i+d} = \sum_{j=0}^{d} f_j q^{j(\alpha+i\beta)} +\end{displaymath} +and thus +\begin{displaymath} + f(i+d-j) + g(j) + h(i) = \beta ij + \alpha j +\end{displaymath} +Setting $f(x)=-\beta (x-d)^2/2$, we get +\begin{eqnarray*} +-\beta i^2/2 + \beta ij - \beta j^2/2 + g(j) + h(i) & = & \beta ij + \alpha j\\ +-\beta i^2/2 - \beta j^2/2 + g(j) + h(i)& = & \alpha j +\end{eqnarray*} +% +which admits a solution with $g(x) = \beta x^2/2 + \alpha x$ and +$h(x) = \beta x^2/2$. The $f(x)$ values are still symmetric around $x=d$ but +due to the $\alpha$ term in $g(x - 1) - g(x) = -\beta x + \beta/2 - \alpha$, +we cannot reuse the $b'_{j'}$ sequence as before and computing the $c_j$ values +requires $3$ multiplies each, for a total cost for the multipoint evaluation +of $5d + o(d)$ multiplications and a convolution of length $l$. + + +\subsection{Progression in Montgomery's fast P-1 stage 2} +Montgomery wants to evaluate, in his notation, $f(X)$ with +$X=b_1^{2k_2 + (2m+1)P}$, for $m_1 \leq m \leq m_2$. In our notation, +this means $\alpha = 2k_2 + (2m_1+1)P$ and $\beta = 2P$. Further, he requires +that the sequences $b_j$ and $c_j$ are symmetric so that by suitable shifting +by $s, t$ we have $b_{j+s} = b_{-(j+s)}$ and $c_{j+t} = c_{-(j+t)}$. +Let $l = n + d$. + +He sets $f(x)=\alpha (n-1+d/2-x)+\beta(n-1+d/2-x)^2/2$ and +$g(x)=-\beta(x-d/2)^2/2$ and $h(x) = -\beta x^2/2$. 
+This way, +\begin{eqnarray*} +g_{2d-1-i} & = & \sum_{0\leq j\leq d} b_{i-j} c_j \\ + & = & \sum_{0\leq j\leq d} q^{f(i-j)} q^{g(j)} f_j +\end{eqnarray*} + + +\subsection{What about exponents that are powers} +If we want $F(q^{i^S})$, $S>1$, we'll need $f(x)$, $g(x)$ and $h(x)$ so that +\begin{displaymath} + f(i+d-j) + g(j) = h(i) + i^{S}j +\end{displaymath} +In order to get the $i^{S}j$ term, $f(x)$ must have degree at least $S+1$, +which for $S>1$ produces at least one term in $i^k j^l$, $k,l>0$ other +than the $i^{S}j$ we want, and those extra terms cannot be absorbed by $g(j)$ +and $h(i)$. It may work if we convolute more than two sequences, +but this will require a greater convolution length. + +\subsection{What if $F(x)$ is symmetric?} +Let $F(x) = \sum_{j=1}^{d} f_j (x^j + x^{-j})$ of degree $2d$. +We would like +\begin{eqnarray*} +g_{i+d} & = & q^{h(i)} F(q^i) \\ + & = & h_i \sum_{j=1}^{d} f_j \left(q^{ij} + q^{-ij} \right) +\end{eqnarray*} +where $h_i$ may depend on $q$, $d$ and $i$, but not $j$. + +I am hopelessly stuck here. I can't find $b_j$ and $c_j$ so that +$c_{i-j}b_j = h_i f_j \left(q^{ij} + q^{-ij} \right)$. +\end{document} diff -Nru gmp-ecm-7.0.4+ds/techdocs/curve_convert.tex gmp-ecm-7.0.5+ds/techdocs/curve_convert.tex --- gmp-ecm-7.0.4+ds/techdocs/curve_convert.tex 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/techdocs/curve_convert.tex 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,48 @@ +\documentclass{article} +\pagestyle{empty} +\usepackage{amsmath} +\begin{document} +\newcommand{\C}{{\bf C}} + +\title{How to convert an elliptic curve from Montgomery to Weierstra\ss{} + coordinates} + +\maketitle + +Curve in Montgomery coordinates: +\begin{displaymath} +E: B Y^2 = X^3 + A X^2 + X +\end{displaymath} + +With a given point $(X, ?)$ we can set $Y=1$ and choose the isomorphic +curve +% +\begin{displaymath} +G Y^2 = X^3 + A X^2 + X +\end{displaymath} +% +with $G = X^3 + A X^2 + X$. Now (X, 1) is a valid point on this curve. 
+Dividing the equation by $G^3$ yields +% +\begin{displaymath} +\left(\frac{Y}{G}\right)^2 = \left(\frac{X}{G}\right)^3 + + \frac{A}{G} \left(\frac{X}{G}\right)^2 + + \frac{1}{G^2} \left(\frac{X}{G}\right) +\end{displaymath} +% +so we can set $x = X/G$, $y = Y/G$ and receive the curve +\begin{displaymath} +y^2 = x^3 + \frac{A}{G} x^2 + \frac{1}{G^2} x +\end{displaymath} + +This is not in short Weierstra\ss{} form yet. By choosing instead +$x = (X+A/3)/G$, $y = Y/G$, $a = (1-A^2/3)/G^2$, +$b = A (2 A^2 - 9)/(27 G^3)$, we get +\begin{displaymath} +y^2 = x^3 + a x + b +\end{displaymath} + +This is the desired curve isomorphic to $E$, with $(x,y)$ corresponding +to $(X,?)$ on $E$. + +\end{document} \ No newline at end of file diff -Nru gmp-ecm-7.0.4+ds/techdocs/mulrecip.tex gmp-ecm-7.0.5+ds/techdocs/mulrecip.tex --- gmp-ecm-7.0.4+ds/techdocs/mulrecip.tex 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/techdocs/mulrecip.tex 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,231 @@ +\documentclass{article} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage{amssymb} +\begin{document} +\newcommand{\Z}{\mathbb{Z}} +\newcommand{\Zn}[1]{(\Z/{#1}\Z)^{*}} +\newcommand{\tf}{\tilde{f}} +\newcommand{\tg}{\tilde{g}} +\newcommand{\rev}{\textrm{rev}} + +\title{Multiplication of reciprocal Laurent polynomials with a discrete + weighted transform} +\author{A. Kruppa} +\maketitle + +\begin{abstract} +This note attempts to develop a method for multiplying two reciprocal +Laurent polynomials of degrees $\leq n - 2$ using two regular polynomial +multiplications degree $\leq n/2 - 1$ each, or with a weighted DFT of +length $n$. 
+\end{abstract} + +\section{Multiplying reciprocal Laurent polynomials} + +Let $f(x)$ be a reciprocal Laurent polynomial +\begin{displaymath} +f(x) = f_0 + \sum_{i=1}^{d_f} f_i (x^i + x^{-i}) +\end{displaymath} +of degree $2d_f$ and $\tf(x) = \tf_0 + \sum_{i=1}^{d_f} \tf_i x^i$ with +$\tf_0 = f_0 / 2, \tf_i = f_i \textrm{ for } i>0$ a polynomial so that +$f(x) = \tf(x) + \tf(1/x)$. Likewise for $g(x)$ and $\tg(x)$. +Their product $h(x) = f(x)g(x)$ is a reciprocal Laurent polynomial of degree +$2(d_f + d_g)$, $h(x) = h_0 + \sum_{i=1}^{d_f + d_g} h_i (x^i + x^{-i})$. + +\subsection{Without DWT} + +Let $\rev(\tf(x)) = x^{d_f} \tf(1/x)$ denote the polynomial with +reversed sequence of coefficients. We have $\rev(\rev(\tf(x))) = \tf(x)$ and +$\rev(\tf(x) \tg(x)) = \rev(\tf(x)) \rev(\tg(x))$. +Let $\lfloor f(x) \rfloor$ denote a +polynomial whose coefficients at non-negative exponents of $x$ are equal to +those in $f(x)$, and whose coefficients at negative exponents of $x$ are +$0$. We have +$\lfloor f(x) + g(x) \rfloor = \lfloor f(x) \rfloor + \lfloor g(x) \rfloor$. + +Now we can compute the product +\begin{eqnarray*} + & & f(x)g(x) \\ + & = & (\tf(x) + \tf(1/x)) (\tg(x) + \tg(1/x)) \\ + & = & \tf(x) \tg(x) + \tf(x) \tg(1/x) + \tf(1/x)\tg(x) + \tf(1/x)\tg(1/x) \\ +% & = & \tf(x) \tg(x) + x^{-d_g} \tf(x) \rev(\tg(x)) + x^{-d_f} \rev(\tf(x))\tg(x) + \tf(1/x)\tg(1/x) \\ + & = & \tf(x) \tg(x) + x^{-d_g} \tf(x) \rev(\tg(x)) + x^{-d_f} \rev(\tf(x) \rev(\tg(x))) + \tf(1/x)\tg(1/x) +\end{eqnarray*} +but we only want to store the coefficients at non-negative exponents in the +product, so +\begin{eqnarray*} + & & \lfloor f(x)g(x) \rfloor \\ + & = & \tf(x) \tg(x) + \lfloor x^{-d_g} \tf(x) \rev(\tg(x)) \rfloor + \lfloor x^{-d_f} \rev(\tf(x) \rev(\tg(x))) \rfloor + \tf_0 \tg_0. 
+\end{eqnarray*} + +We can compute this with two multiplications of a degree $d_f$ and a +degree $d_g$ polynomial, as well as $d_f + d_g + 2$ additions and one doubling +of the $\tf_0 \tg_0$ term in $\tf(x) \tg(x)$. If an FFT based +multiplication routine is used, the forward transform of $\tf$ can be re-used +directly, and the forward transform of $\tg$ can be reused by observing that +the $i$-th coefficient in the length $l$ DFT of $\tg(x)$ is +$\tg(\omega^i)$, $\omega^l=1$, and in the DFT of $\rev(\tg(x))$ is +$\omega^{id_g} \tg(\omega^{l-i})$, so the coefficients at indices $1 \ldots l-1$ +only need to be reversed in order and suitably weighted. (I haven't actually +tried this idea of re-using the forward transform) + +\subsection{With DWT} + +The product RLP $h(x)$ of degree $2d_h = 2(d_f+d_g)$ has $d_h + 1$ +possibly distinct coefficients in standard basis which we can obtain from +a discrete weighted FFT convolution of length $l \geq d_h + 1$. + +Suppose we compute $\hat{h}(x) = f(x) \cdot g(x) \bmod (x^l - 1/a)$, then +\begin{eqnarray*} + \hat{h}(x) & = & h_0 + \sum_{i=1}^{d_h} h_i (x^i + ax^{l-i}) \\ + & = & \sum_{i=0}^{d_h} h_i x^i + \sum_{i=1}^{d_h} ah_i x^{l-i}\\ + & = & \sum_{i=0}^{d_h} h_i x^i + \sum_{i=l-d_h}^{l-1} ah_{l-i} x^i\\ + & = & \sum_{i=0}^{l-d_h-1} h_i x^i + + \sum_{i=l-d_h}^{d_h} (h_i + ah_{l-i}) x^i + + \sum_{i=d_h+1}^{l-1} ah_{l-i} x^{i} +\end{eqnarray*} + +For $l-d_h \leq i < l/2$, $\hat{h}_i = h_i + ah_{l-i}$ and +$\hat{h}_{l-i} = ah_i + h_{l-i}$. With $a=1$ as in an unweighted FFT, +$h_i$ and $h_{l-i}$ cannot be separated as $\hat{h}_i = \hat{h}_{l-i}$. +With $a=-1$, they cannot be separated either, as +$\hat{h}_i = - \hat{h}_{l-i}$. +With, e.g., $a=\sqrt{-1}$, we have +\begin{eqnarray*} +a (h_i + ah_{l-i}) - (ah_i + h_{l-i}) & = & (a^2-1) h_{l-i} \\ + & = & -2 h_{l-i} +\end{eqnarray*} +so we can separate $h_i$ and $h_{l-i}$.
Hence after processing +\begin{eqnarray*} +\hat{h}_{l-i} & := & -(a \hat{h}_i - \hat{h}_{l-i})/2 \\ +\hat{h}_i & := & \hat{h}_i - a\hat{h}_{l-i} +\end{eqnarray*} +for $l-d_h \leq i < l/2$, we have the desired coefficients $h_i$ for +$0 \leq i \leq d_h$ in $\hat{h}_i$. + + +% Vecdiv(a,b)={if(length(a)!=length(b),error("Vecdiv: vectors of unequal length"));vector(length(a),i,a[i]/b[i])} +% Vecmul(a,b)={if(length(a)!=length(b),error("Vecdiv: vectors of unequal length"));vector(length(a),i,a[i]*b[i])} +% Vecdiv(Vecrev(Polrev([f0,f1*w,f2*w^2,f3*w^3,0*w^4,f3*w^-3,f2*w^-2,f1*w^-1])^2 % (x^8-1)), [1,w^1,w^-6,w^-5,w^-4,w^-3,w^-2,w^-1]) + +% (Vecrev((f0 + f1*(x+1/x) + f2*(x^2+1/x^2) + f3*(x^3+1/x^3) + f4*(x^4+1/x^4))^2 % (x^10-w^10)) - Vecdiv(Vecrev(Polrev(Vecmul([f0,f1,f2,f3,f4,0,f4,f3,f2,f1],[1,w,w^2,w^3,w^4,w^5,w^-4,w^-3,w^-2,w^-1]))^2 % (x^10-1)), [1,w^1,w^2,w^3,w^4,w^5,w^6,w^7,w^8,w^9])) + +% (Vecrev(f(x)^2 % (x^10-w^10)) - Vecdiv(Vecrev(Polrev(Vecrev(f(w*x) % (x^10-1)))^2 % (x^10-1)), [1,w^1,w^2,w^3,w^4,w^5,w^6,w^7,w^8,w^9])) + +To obtain the desired product $\bmod{(x^l-1/a)}$, we can choose a +constant $w^l=1/a$ and compute the product +$h(wx) \bmod{(x^l-1)} = +(f(wx) \bmod{(x^l-1)} \cdot g(wx) \bmod{(x^l-1)}) \bmod {(x^l-1)}$. We have +\begin{eqnarray*} + h(wx) \bmod{(x^l-1)} & = & \sum_{i=0}^{d_h} (w^i h_i x^i + w^{-i} h_i x^{l-i}) \\ + & = & \sum_{i=0}^{d_h} w^i h_i x^i + \sum_{i=0}^{d_h} w^{-i} h_i x^{l-i} \\ + & = & \sum_{i=0}^{d_h} w^i h_i x^i + \sum_{i=l-d_h}^l w^{i-l} h_{l-i} x^i \\ + & = & \sum_{i=0}^{l-d_h-1} w^i h_i x^i + \sum_{i=l-d_h}^l (w^i h_i + w^{i-l} h_{l-i}) x^i \\ + & = & \sum_{i=0}^{l-d_h-1} w^i h_i x^i + \sum_{i=l-d_h}^l w^i (h_i + a h_{l-i}) x^i +\end{eqnarray*} +so dividing the $i$-th coefficient by $w^i$ yields $\hat{h}_i$ as desired. 
+ +For example, with two degree $6$ reciprocal Laurent polynomials and a length +$8$ convolution, the coefficient vectors of $f(wx) \bmod{(x^8-1)}$ and +$g(wx) \bmod{(x^8-1)}$ +are $(f_0, w f_1, w^2 f_2, w^3 f_3, 0, w^{-3} f_3, w^{-2} f_2, w^{-1} f_1)$ +and \\ +$(g_0, w g_1, w^2 g_2, w^3 g_3, 0, w^{-3} g_3, w^{-2} g_2, w^{-1} g_1)$, +respectively. +Their product $\bmod{(x^8-1)}$ has coefficient vector +\begin{eqnarray*} +&& (f_0 g_0 + 2(f_1 g_1 + f_2 g_2 + f_3 g_3), \\ +&& (f_1 g_0 + (f_0 + f_2) g_1 + (f_1 + f_3) g_2 + f_2 g_3) w, \\ +&& ((f_2 g_0 + (f_1 + f_3) g_1 + f_0 g_2 + f_1 g_3) w^8 + f_3 g_3)/w^6, \\ +&& ((f_3 g_0 + f_2 g_1 + f_1 g_2 + f_0 g_3) w^8 + f_3 g_2 + f_2 g_3)/w^5, \\ +&& ((f_3 g_1 + f_2 g_2 + f_1 g_3) w^8 + f_3 g_1 + f_2 g_2 + f_1 g_3)/w^4, \\ + && ((f_3 g_2 + f_2 g_3) w^8 + f_3 g_0 + f_2 g_1 + f_1 g_2 + f_0 g_3)/w^3, \\ +&& (f_3 g_3 w^8 + f_2 g_0 + (f_1 + f_3) g_1 + f_0 g_2 + f_1 g_3)/w^2, \\ +&& (f_1 g_0 + (f_0 + f_2) g_1 + (f_1 + f_3) g_2 + f_2 g_3)/w). +\end{eqnarray*} +With +\begin{eqnarray*} +&& h_0 = f_0 g_0 + 2 f_1 g_1 + 2 f_2 g_2 + 2 f_3 g_3 \\ +&& h_1 = f_1 g_0 + (f_0 + f_2) g_1 + (f_1 + f_3) g_2 + f_2 g_3 \\ +&& h_2 = f_2 g_0 + (f_1 + f_3) g_1 + f_0 g_2 + f_1 g_3 \\ +&& h_3 = f_3 g_0 + f_2 g_1 + f_1 g_2 + f_0 g_3 \\ +&& h_4 = f_3 g_1 + f_2 g_2 + f_1 g_3 \\ +&& h_5 = f_3 g_2 + f_2 g_3 \\ +&& h_6 = f_3 g_3 \\ +\end{eqnarray*} +this vector is equal to +$(h_0, h_1 w, h_2 w^2 + h_6 w^{-6}, h_3 w^3 + h_5 w^{-5}, h_4 w^4 + h_4 w^{-4}, + h_5 w^5 + h_3 w^{-3}, h_6 w^6 + h_2 w^{-2}, h_1 w^{-1})$ +and after dividing the $i$-th coefficient by $w^i$ is equal to +$(h_0, h_1, h_2 + h_6 w^{-8}, h_3 + h_5 w^{-8}, h_4 + h_4 w^{-8}, + h_5 + h_3 w^{-8}, h_6 + h_2 w^{-8}, h_1 w^{-8}) = (h_0, h_1, h_2 + a h_6, +h_3 + a h_5, h_4 + a h_4, h_5 + a h_3, h_6 + a h_2, a h_1)$. +For $0 \leq i < l - d_h = 2$, the coefficients $h_i$ can be read directly. +For $l - d_h \leq i \leq l/2$, coefficients $i$ and $l-i$ overlap and must +be separated as shown above. 
+ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\subsection{With DWT (rewrite)} + +Let $Q(X) = q_0 + \sum_{j=1}^{d_q} q_j (X^j + X^{-j})$ be an RLP of degree +$2d_q \leq 2l-2$ and likewise $R(X)$ an RLP of degree $2d_r \leq 2l-2$. +To obtain the product RLP $S(X) = Q(X) R(X)$ of degree $2d_s = 2(d_q+d_r)$, +we can perform a weighted +convolution product by computing $\tilde{S}(wX) = Q(wX) R(wX) \bmod{(X^l-1)}$, +$w^l = \sqrt{-1}$, and separating the wrapped-around coefficients in +$\tilde{S}(X)$ to obtain the coefficients of $S(X)$. Since $w$ is a $4l$-th +root of unity we need to choose NTT primes $p_j \equiv 1 \pmod{4l}$. + +With $\tilde{S}(wX) = \sum_{j=0}^{l-1} \tilde{s}_j x^j = S(wX) \bmod (X^l-1)$ +we have +\begin{eqnarray*} + \tilde{S}(wX) + & = & \sum_{j=0}^{d_s} (w^j s_j x^j + w^{-j} s_j x^{l-j}) \\ + & = & \sum_{j=0}^{d_s} w^j s_j x^j + \sum_{j=0}^{d_s} w^{-j} s_j x^{l-j} \\ + & = & \sum_{j=0}^{d_s} w^j s_j x^j + \sum_{j=l-d_s}^l w^{j-l} s_{l-j} x^j \\ + & = & \sum_{j=0}^{l - d_s - 1} w^j s_j x^j + + \sum_{j=l-d_s}^l w^j (s_j + w^l s_{l-j}) x^j +\end{eqnarray*} +Hence for $0 \leq i < l - d_s$ we can read $s_i$ directly off $\tilde{s}_i$ +while the remaining coefficients first need to be separated by using +$w^l \tilde{s}_j - \tilde{s}_{l-j} = w^l (s_j + w^l s_{l-j}) - +(w^l s_j + s_{l-j}) = (w^{2l}-1) s_{l-j} = -2 s_{l-j}.$ + +These ideas yield the algorithm shown in figure~\ref{DWTNTT}. Since we need +only squaring of RLPs in section \ref{}, we show only the squaring algorithm +here.
+ +\begin{figure}[ht] +\begin{center} +\begin{tabular}{l} +\hline + Input: RLP $Q(X) = \sum_{j=0}^{d_q} q_j (x^j + x^{-j})$ of degree $2d_q$ + in standard basis \\ + Output: RLP $S(X) = \sum_{j=0}^{d_s} s_j (x^j + x^{-j}) = Q(X)^2$ of degree + $2d_s = 4d_q$ \\ in standard basis \\ + Auxiliary storage: A length $l > 2d_s$ NTT array $M$ with \\ + separate memory for vectors mod each $p_j$\\ +\hline + For each prime $p_j$ \\ +\qquad Compute $w$ with $w^{2l} \equiv -1 \pmod{p_j}$ \\ +\qquad For $0 \leq i \leq d_q$ store $w^i q_i \pmod{p_j}$ in $M_i$ \\ +\qquad For $1 \leq i \leq d_q$ store $w^{-i} q_i \pmod{p_j}$ in $M_{l-i}$ \\ +\qquad Perform forward NTT modulo $p_j$ on $M$, square elementwise and \\ +\qquad \quad perform inverse +NTT \\ +\qquad For $1 \leq i \leq d_s$ set $M_i := w^{-i} M_i \pmod{p_j}$ \\ +\qquad For $l - d_s < i \leq l/2$ \\ +\qquad \qquad Set $M_{l-i} := -(w^l M_i - M_{l-i})/2 \pmod{p_j}$ \\ +\qquad \qquad If $i < l/2$ set $M_i := M_i - w^l M_{l-i} \pmod{p_j}$ \\ + For $0 \leq i \leq d_s$ perform CRT on $M_i$ residues to obtain $s_i$, store + in output \\ +\hline +\end{tabular} +\end{center} +\caption{NTT based squaring algorithm for Reciprocal Laurent polynomials} +\label{DWTNTT} +\end{figure} + +\end{document} diff -Nru gmp-ecm-7.0.4+ds/techdocs/scaleV.tex gmp-ecm-7.0.5+ds/techdocs/scaleV.tex --- gmp-ecm-7.0.4+ds/techdocs/scaleV.tex 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/techdocs/scaleV.tex 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,216 @@ +\documentclass[a4paper]{article} +\usepackage[latin1]{inputenc} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{amsthm} +\usepackage[noline]{algorithm2e} +\usepackage{graphicx} +\usepackage{verbatim} + +\newcommand{\F}{\mathbb{F}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\Z}{\mathbb{Z}} +\newcommand{\Q}{\mathbb{Q}} +\newcommand{\R}{\mathbb{R}} +\newcommand{\C}{\mathbb{C}} + +\begin{document} +\title{Simpler scaling an RLP by a constant and its reciprocal} +\author{A. 
Kruppa} +\maketitle +\abstract{We present a simpler algorithm for scaling a reciprocal Laurent +polynomial by a power and its inverse, a replacement for the algorithm +presented in \cite{PMP1}[7.1].} + + +\section{Scaling} + +Let $V_n(x)$ be a Chebyshev polynomial of degree $n$, defined by the +functional equation $V_n(x+x^{-1}) = x^n + x^{-n}$, e.g., $V_0(x)=2$ and +$V_1(x)=x$. We define $V_{-n}(x) = V_n(x)$. +These polynomials satisfy many identities such as +$V_{m+n}(x) = V_m(x)V_n(x) - V_{m-n}(x)$, which implies +% $V_{n+1}(x) = xV_n(x) - V_{n-1}(x)$ and +$V_{2n}(x) + 2 = V_n(x)^2$. + +For a given reciprocal Laurent polynomial (RLP) in standard basis +\begin{eqnarray*} + F(x) & = & f_0 + \sum_{i=1}^{d} f_i V_i(x+x^{-1}) \\ + & = & \sum_{i=-d}^d f_{|i|} x^i, +\end{eqnarray*} +of degree $2d$, +we want to compute +$F(\gamma x)F(\gamma^{-1} x)$, +an RLP of degree $4d$, with $Q = \gamma + \gamma^{-1}$. +We know $Q$ but not necessarily $\gamma$. + +We have +\begin{displaymath} + F^2(x) = G(x) = g_0 + \sum_{i=1}^{2d} g_i V_i(x+x^{-1}) +\end{displaymath} +with, for $0 \leq k \leq 2d$, +\begin{eqnarray*} + g_k +% & = & \sum_{-d \leq i,j \leq d} [i+j = k] f_i f_j \\ +% & = & \sum_{-d \leq i,j \leq d, i+j = k} f_i f_j \\ +% & = & \sum_{-d \leq i,j \leq d, i+j = k} f_i f_{k-i} \\ +% & = & \sum_{-d \leq i \leq d, -d \leq k-i \leq d} f_i f_{k-i} \\ +% & = & \sum_{-d \leq i \leq d, -d-k \leq -i \leq d-k} f_i f_{k-i} \\ +% & = & \sum_{-d \leq i \leq d, k-d \leq i \leq d+k} f_i f_{k-i} \\ +% & = & \sum_{\max(-d,k-d) \leq i \leq \min(d,d+k)} f_i f_{k-i} \\ + & = & \sum_{k-d \leq i \leq d} f_i f_{k-i}. 
\\ +\end{eqnarray*} +For example, for $d=5$, +\begin{eqnarray*} +g_0 & = & 2 f_{5}^2 + 2 f_{4}^2 + 2 f_{3}^2 + 2 f_{2}^2 + 2 f_{1}^2 + f_{0}^2 \\ +g_1 & = & 2 f_{4} f_{5} + 2 f_{3} f_{4} + 2 f_{2} f_{3} + 2 f_{1} f_{2} + 2 f_{0} f_{1} \\ +g_2 & = & 2 f_{3} f_{5} + 2 f_{2} f_{4} + 2 f_{1} f_{3} + 2 f_{0} f_{2} + f_{1}^2 \\ +g_3 & = & 2 f_{2} f_{5} + 2 f_{1} f_{4} + 2 f_{0} f_{3} + 2 f_{1} f_{2} \\ +g_4 & = & 2 f_{1} f_{5} + 2 f_{0} f_{4} + 2 f_{1} f_{3} + f_{2}^2 \\ +g_5 & = & 2 f_{0} f_{5} + 2 f_{1} f_{4} + 2 f_{2} f_{3} \\ +\pagebreak +g_6 & = & 2 f_{1} f_{5} + 2 f_{2} f_{4} + f_{3}^2 \\ +g_7 & = & 2 f_{2} f_{5} + 2 f_{3} f_{4}\\ +g_8 & = & 2 f_{3} f_{5} + f_{4}^2 \\ +g_9 & = & 2 f_{4} f_{5} \\ +g_{10} & = & f_{5}^2 \\ +\end{eqnarray*} +For $0 \leq k \leq 2d$ and $k$ even +\begin{displaymath} + g_k = \left( \sum_{k-d \leq i < k/2} 2 f_i f_{k-i} \right) + f_{k/2}^2 +\end{displaymath} +and for $k$ odd +\begin{displaymath} + g_k = \sum_{k-d \leq i \leq (k-1)/2} 2 f_i f_{k-i} +\end{displaymath} + + +%Let $U_n(x)$ be a Chebyshev polynomial defined by the functional equation +%$U_n(x + x^{-1}) = \frac{x^n - x^{-n}}{x - x^{-1}}$. +%Therefore, +%$U_0(x + x^{-1}) = 0$ and so $U_0(x) = 0$, +%$U_1(x + x^{-1}) = 1$ and so $U_1(x) = 1$, +%$U_2(x + x^{-1}) = x + x^{-1}$ and so $U_2(x) = x$, +%and $U_{m+n}(x) = U_m(x)V_n(x) - U_{m-n}(x)$, +%in particular +%$U_{m+1}(x) = U_m(x)x - U_{m-1}(x)$. +%For $n > 0$, the degree of $U_n(x)$ is $n-1$. +%$V_{m-n}(x) = V_m(x)V_n(x) - V_{m+n}(x)$. + +We want +\begin{displaymath} + F(\gamma x)F(\gamma^{-1} x) = H(x) = h_0 + \sum_{i=1}^{2d} h_i V_i(x+x^{-1}). 
+\end{displaymath} +For example, for $d = 5$, +\begin{eqnarray*} +h_0 & = & V_{10}(Q) f_{5}^2 + V_{8}(Q) f_{4}^2+ V_{6}(Q) f_{3}^2 + V_{4}(Q) f_{2}^2+ V_{2}(Q) f_{1}^2+ f_{0}^2 \\ +h_1 & = & V_{9}(Q) f_{4} f_{5}+ V_{7}(Q) f_{3} f_{4}+ V_{5}(Q) f_{2} f_{3} + V_{3}(Q) f_{1} f_{2}+ V_{1}(Q) f_{0} f_{1} \\ +h_2 & = & V_{8}(Q) f_{3} f_{5}+ V_{6}(Q) f_{2} f_{4}+ V_{4}(Q) f_{1} f_{3} + V_{2}(Q) f_{0} f_{2}+ f_{1}^2 \\ +h_3 & = & V_{7}(Q) f_{2} f_{5}+ V_{5}(Q) f_{1} f_{4}+ V_{3}(Q) f_{0} f_{3} + V_{1}(Q) f_{1} f_{2} \\ +h_4 & = & V_{6}(Q) f_{1} f_{5}+ V_{4}(Q) f_{0} f_{4}+ V_{2}(Q) f_{1} f_{3} + f_{2}^2 \\ +h_5 & = & V_{5}(Q) f_{0} f_{5}+ V_{3}(Q) f_{1} f_{4}+ V_{1}(Q) f_{2} f_{3} \\ +h_6 & = & V_{4}(Q) f_{1} f_{5}+ V_{2}(Q) f_{2} f_{4}+ f_{3}^2 \\ +h_7 & = & V_{3}(Q) f_{2} f_{5}+ V_{1}(Q) f_{3} f_{4}\\ +h_8 & = & V_{2}(Q) f_{3} f_{5}+ f_{4}^2\\ +h_9 & = & V_{1}(Q) f_{4} f_{5}\\ +h_{10} & = & f_{5}^2 +\end{eqnarray*} +For $0 \leq k \leq 2d$ and $k$ even +\begin{eqnarray*} + h_k & = & \sum_{k-d \leq i \leq d} \gamma^{2i-k} f_i f_{k-i} \\ +% & = & \sum_{k-d \leq i < k/2} \gamma^{2i-k} f_i f_{k-i} + +% f_{k/2}^2 + +% \sum_{k/2 < i \leq d} \gamma^{2i-k} f_i f_{k-i} \\ +% & = & \sum_{k-d \leq i < k/2} \gamma^{2i-k} f_i f_{k-i} + +% f_{k/2}^2 + +% \sum_{k-d \leq i < k/2} \gamma^{k-2i} f_{k-i} f_{i} \\ +% & = & \sum_{k-d \leq i < k/2} (\gamma^{2i-k} + \gamma^{k-2i}) f_i f_{k-i} + +% f_{k/2}^2 \\ + & = & \left( \sum_{k-d \leq i < k/2} V_{2i-k}(Q) f_i f_{k-i} \right) + + f_{k/2}^2 \\ +% & = & \left( \sum_{k-d \leq i < k/2} V_{2i-k}(Q) f_i f_{k-i} \right) + +% \frac{V_0(Q)}{2} f_{k/2}^2 \\ +\end{eqnarray*} +and for $k$ odd +\begin{eqnarray*} + h_k & = & \sum_{k-d \leq i \leq d} \gamma^{2i-k} f_i f_{k-i} \\ +% & = & \sum_{k-d \leq i \leq (k-1)/2} \gamma^{2i-k} f_i f_{k-i} + +% \sum_{(k+1)/2 \leq i \leq d} \gamma^{2i-k} f_i f_{k-i} \\ + & = & \sum_{k-d \leq i \leq (k-1)/2} V_{2i-k}(Q) f_i f_{k-i}. 
+\end{eqnarray*} + +Using $V_{2i-k}(x) = V_i(x)V_{k-i}(x) - V_k(x)$ we can write, for $k$ even, +\begin{eqnarray*} + h_k & = & \sum_{k-d \leq i < k/2} \left( V_i(Q)V_{k-i}(Q) - V_k(Q) \right) f_i f_{k-i} + + f_{k/2}^2 \\ + & = & \sum_{k-d \leq i < k/2} (V_i(Q) f_i) (V_{k-i}(Q) f_{k-i}) + - V_k(Q) \sum_{k-d \leq i < k/2} f_i f_{k-i} + + f_{k/2}^2 \\ + & = & \sum_{k-d \leq i < k/2} (V_i(Q) f_i) (V_{k-i}(Q) f_{k-i}) + - \frac{V_k(Q)}{2} (g_k - f_{k/2}^2) + + f_{k/2}^2 \\ + & = & \sum_{k-d \leq i < k/2} (V_i(Q) f_i) (V_{k-i}(Q) f_{k-i}) + - \frac{V_k(Q)}{2} g_k + + \frac{V_k(Q) + 2}{2} f_{k/2}^2 \\ + & = & \sum_{k-d \leq i < k/2} (V_i(Q) f_i) (V_{k-i}(Q) f_{k-i}) + - \frac{V_k(Q)}{2} g_k + + \frac{V_{k/2}(Q)^2}{2} f_{k/2}^2 \\ + & = & \sum_{k-d \leq i < k/2} (V_i(Q) f_i) (V_{k-i}(Q) f_{k-i}) + - \frac{V_k(Q)}{2} g_k + + \frac{1}{2} ( V_{k/2}(Q) f_{k/2} )^2 \\ + & = & \frac{1}{2} \left( + \sum_{k-d \leq i < k/2} 2 (V_i(Q) f_i) (V_{k-i}(Q) f_{k-i}) + + ( V_{k/2}(Q) f_{k/2} )^2 \right) + - \frac{V_k(Q)}{2} g_k \\ +\end{eqnarray*} +and for $k$ odd +\begin{eqnarray*} + h_k & = & \sum_{k-d \leq i \leq (k-1)/2} (V_i(Q)V_{k-i}(Q) - V_k(Q)) f_i f_{k-i} \\ + & = & \sum_{k-d \leq i < k/2} (V_i(Q) f_i) (V_{k-i}(Q) f_{k-i}) + - \frac{V_k(Q)}{2} g_k +\end{eqnarray*} +In both cases, even and odd, we can write +\begin{equation} + h_k = \frac{1}{2} (\tilde{g}_k - V_k(Q)g_k) +\end{equation} +where +$\tilde{G}(x) = \tilde{F}(x)^2 = \tilde{g}_0 + \sum_{i=1}^{2d} \tilde{g}_i +V_i(x+x^{-1})$ with +\begin{displaymath} +\tilde{F}(x) = V_0(Q) f_0 + \sum_{i=1}^{2d} V_i(Q) f_i V_i(x+x^{-1}). +\end{displaymath} + +Thus the algorithm could be informally described as ''square F then add weights, +add weights to F then square, take half the difference,'' where the weight +signal is $V_i(Q)$ for $i = 0, 1, 2, \ldots$ + +The algorithm can work in-place, using only storage for input and output +polynomial, which may overlap, and for the NTT vectors.
+We first square F in the NTT buffer N, then +for each i do + $t := V_i(Q) \cdot \mbox{from\_NTT}(N_i);$ + $N_i = \mbox{to\_NTT}(V_i(Q) \cdot F_i);$ + $R_i = t;$ +then do the squaring in the NTT buffer, subtract R from the result and +divide by 2. + +\subsection{Ramblings} +Doing the weighting on-the-fly will need new/modified from\_ntt/to\_ntt +functions. Maybe use Jason's idea of callback functions to fill in data? + +When using transform-based convolution products, the subtraction could be done +in transform space, eliminating one inverse transform and so reducing the +cost from that of two squarings to the equivalent of one general +multiplication, but at the cost of storing one additional NTT buffer. + +It is possible to compute the $V_i(Q)$ in NTT form and multiply the polynomial +coefficients by these weights in NTT form as well, which may perhaps be faster +than doing it with normal REDC etc. Not sure if that would save +much. + +\begin{thebibliography}{9} + \bibitem{PMP1} P. L. Montgomery and A. Kruppa, Improved Stage 2 to P$\pm$1 Factoring Algorithms, in Proceedings of ANTS-VIII 2008, LNCS 5011, Springer, pp. 180--195. +\end{thebibliography} + +\end{document} + diff -Nru gmp-ecm-7.0.4+ds/techdocs/schoen_strass.tex gmp-ecm-7.0.5+ds/techdocs/schoen_strass.tex --- gmp-ecm-7.0.4+ds/techdocs/schoen_strass.tex 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/techdocs/schoen_strass.tex 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,85 @@ +\documentclass{article} +\usepackage{amsmath} +\DeclareMathOperator{\len}{len} +\DeclareMathOperator{\mult}{mul^{T}} + +\begin{document} + +\title{Sch\"onhage-Strassen multiplication for GMP-ECM} +\author{A.
Kruppa} +\maketitle + +\section{Transposed multiplication} +\subsection{Transposed Karatsuba} +Interpret the input vectors $A$ and $B$ of length $\len(A)$ and $\len(B)$, +respectively, as polynomials $A(x) = \sum_{i=0}^{\len(A)-1} a_i x^i$ and +$B(x) = \sum_{i=0}^{\len(B)-1} b_i x^i$ of degree $\len(A)-1$ and $\len(B)-1$, +respectively. Assume $4\mid \len(B)$ and let $h = \len(B)/4$. +Set +\begin{displaymath} +B_k = \sum_{i=0}^{h-1} b_{i+kh} x^i, \textrm{for } k = 0, 1, 2, 3, +\end{displaymath} +that is, cut $B$ into four equal-size pieces. +Assume $\len(A) = 2h$ or $\len(A) = 2h+1$ and set +\begin{eqnarray*} +A_0 & = & \sum_{i=0}^{h-1} a_{i} x^i, \\ +A_1 & = & \sum_{i=0}^{\len(A)-h-1} a_{i+h} x^i. +\end{eqnarray*} +with $a_{2h} = 0$ if $\len(A) = 2h$. + +Let $R = \mult(A, B)$ where $R$ is a polynomial of degree $\deg(R)$, i.e. +% +\begin{displaymath} +R(x) = \sum_{i=0}^{\deg(R)} r_i x^i, +\end{displaymath} +% +\begin{displaymath} +r_i = \sum_{j=0}^{\len(A)-1} a_j b_{j+i}, \textrm{for } 0\leq i \leq \deg(R). +\end{displaymath} + +\pagebreak{} + +Set $T(X) = \mult(A_0 + A_1, B_1 + B_2 x^h)$ +so that +\begin{eqnarray*} +t_i & = & \sum_{j=0}^{h-1} (a_j + a_{j+h}) b_{j+i+h} + a_{2h} b_{i+2h} \\ + & = & \sum_{j=0}^{h-1} (a_j b_{j+i+h} + a_{j+h} b_{j+i+h}) + a_{2h} b_{i+2h} \\ +& & \textrm{for } 0\leq i < h. +\end{eqnarray*} + +Set $U(x) = \mult(A_0, (B_0 - B_1) + (B_1 - B_2) x^h)$ +so that +\begin{eqnarray*} +u_i & = & \sum_{j=0}^{h-1} a_j (b_{j+i} - b_{j+i+h}) \\ + & = & \sum_{j=0}^{h-1} (a_j b_{j+i} - a_j b_{j+i+h}) \\ +& & \textrm{for } 0\leq i < h. +\end{eqnarray*} + +Set $V(x) = \mult(A_1, (B_2 - B_1) + (B_3 - B_2) x^h)$ +so that +\begin{eqnarray*} +v_i & = & \sum_{j=0}^{h-1} a_{j+h} (b_{j+i+2h} - b_{j+i+h}) + a_{2h}(b_{i+3h} - b_{i+2h})\\ + & = & \sum_{j=0}^{h-1} (a_{j+h} b_{j+i+2h} - a_{j+h} b_{j+i+h}) + a_{2h} b_{i+3h} - a_{2h} b_{i+2h}\\ +& & \textrm{for } 0\leq i < h. 
+\end{eqnarray*} + +Let $R(x) = T(x) + U(x)$ so that +\begin{eqnarray*} +r_i & = & \sum_{j=0}^{h-1} (a_j b_{j+i+h} + a_{j+h} b_{j+i+h} + a_j b_{j+i} - a_j b_{j+i+h}) + a_{2h} b_{i+2h} \\ + & = & \sum_{j=0}^{h-1} (a_{j+h} b_{j+i+h} + a_j b_{j+i}) + a_{2h} b_{i+2h} \\ + & = & \sum_{j=0}^{2h-1} a_j b_{j+i} + a_{2h} b_{i+2h} \\ + & = & \sum_{j=0}^{\len(A)-1} a_j b_{j+i} +\end{eqnarray*} + +Let $S(x) = T(x) + V(x)$ so that +\begin{eqnarray*} +s_i & = & \sum_{j=0}^{h-1} (a_j b_{j+i+h} + a_{j+h} b_{j+i+h} + a_{j+h} b_{j+i+2h} - a_{j+h} b_{j+i+h}) \\ + & &+ a_{2h} b_{i+2h} + a_{2h} b_{i+3h} - a_{2h} b_{i+2h}\\ + & = & \sum_{j=0}^{h-1} (a_j b_{j+i+h} + a_{j+h} b_{j+i+2h}) + a_{2h} b_{i+3h} \\ + & = & \sum_{j=0}^{2h-1} a_j b_{j+i+h} + a_{2h} b_{i+3h} \\ + & = & \sum_{j=0}^{\len(A)-1} a_j b_{j+i+h}. +\end{eqnarray*} + +Now set $r_{i+h} = s_i$ for $0 \leq i < h$. This way, +$r_i = \sum_{j=0}^{\len(A)-1} a_j b_{j+i}$ for $0 \leq i < 2h$, as desired. +\end{document} diff -Nru gmp-ecm-7.0.4+ds/test.cgbnecm gmp-ecm-7.0.5+ds/test.cgbnecm --- gmp-ecm-7.0.4+ds/test.cgbnecm 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/test.cgbnecm 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,23 @@ +#!/bin/sh + +# test file for ECM on GPU with CGBN +# +# Copyright 2021 +# Seth Troisi +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; see the file COPYING.
If not, see +# http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., +# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + +./test.gpuecm "${1:-./ecm -cgbn}" diff -Nru gmp-ecm-7.0.4+ds/test_dummy2.save gmp-ecm-7.0.5+ds/test_dummy2.save --- gmp-ecm-7.0.4+ds/test_dummy2.save 2016-06-14 14:54:33.000000000 +0000 +++ gmp-ecm-7.0.5+ds/test_dummy2.save 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ - - - # this is a comment line and should be ignored diff -Nru gmp-ecm-7.0.4+ds/test.ecm gmp-ecm-7.0.5+ds/test.ecm --- gmp-ecm-7.0.4+ds/test.ecm 2016-08-22 11:05:41.000000000 +0000 +++ gmp-ecm-7.0.5+ds/test.ecm 2022-06-06 14:16:49.000000000 +0000 @@ -78,6 +78,12 @@ # exercise the -primetest option echo 2050449353925555290706354283 | $ECM -primetest -param 0 -sigma 7 -k 1 30 0-1e6; checkcode $? 14 +# test the -primetest option +echo 69281828573340491 | $ECM -primetest -param 1 -sigma 758224547 -k 1 2e4; checkcode $? 0 +if [ "$MUL" != "redc" ]; then +echo 69281828573340491 | $ECM -param 1 -sigma 758224547 -k 1 2e4; checkcode $? 8 +fi + # exercise the -q option echo 2050449353925555290706354283 | $ECM -q -param 0 -sigma 7 -k 1 30 0-1e6; checkcode $? 14 @@ -100,17 +106,19 @@ echo 2050449353925555290706354283 | $ECM -param 0 -sigma 7 -I 1 -c 3 100; checkcode $? 14 # check the -chkpnt option -echo 2050449353925555290706354283 | $ECM -chkpnt test.ecm.chk -param 0 -sigma 7 30 1e6 -$ECM -resume test.ecm.chk 30 1e6 +TEST=test.ecm.chk$$ +echo 2050449353925555290706354283 | $ECM -chkpnt $TEST -param 0 -sigma 7 30 1e6 +$ECM -resume $TEST 30 1e6 C=$? -/bin/rm -f test.ecm.chk +/bin/rm -f $TEST checkcode $C 14 # check the -inp option -echo 2050449353925555290706354283 > test.ecm.inp -$ECM -inp test.ecm.inp -param 0 -sigma 7 -I 1 -c 3 100 +TEST=test.ecm.inp$$ +echo 2050449353925555290706354283 > $TEST +$ECM -inp $TEST -param 0 -sigma 7 -I 1 -c 3 100 C=$? -/bin/rm -f test.ecm.inp +/bin/rm -f $TEST checkcode $C 14 # Check a stage 2 of length 1.
g1=1822795201 g2=968809 g3=567947 @@ -190,11 +198,41 @@ # exercise "stage 2 interval too large" error message echo "2^1123-1" | $ECM -v -maxmem 10 11000 1e27; checkcode $? 1 -# exercise "Function Phi() requires 2 parameters" error message -echo "phi(31)" | $ECM 11e3; checkcode $? 1 +# exercise "unknown function" error message +echo "mer(31)" | $ECM 11e3; checkcode $? 1 -# exercise "Invalid parameter passed to the Phi function" error message -echo "phi(31,-1)" | $ECM 11e3; checkcode $? 1 + +# exercise "Function requires x parameters" error message +echo "phi(31)" | $ECM 11e3; checkcode $? 1 +echo "phi(31,5,8)" | $ECM 11e3; checkcode $? 1 +echo "phil(31)" | $ECM 11e3; checkcode $? 1 +echo "phim(31,5,8)" | $ECM 11e3; checkcode $? 1 +echo "u(534)" | $ECM 11e3; checkcode $? 1 +echo "primu(1,534)" | $ECM 11e3; checkcode $? 1 +echo "gcd(35)" | $ECM 11e3; checkcode $? 1 +echo "gcd(2,6,60)" | $ECM 11e3; checkcode $? 1 + + +# exercise "Invalid parameter passed to function" error message +echo "phi(-5,2)" | $ECM 11e3; checkcode $? 1 +echo "phi(2^67-1,2)" | $ECM 11e3; checkcode $? 1 +echo "u(1,-1,-6)" | $ECM 11e3; checkcode $? 1 +echo "u(1,-1,2^67-1)" | $ECM 11e3; checkcode $? 1 +echo "primu(1,-1,-6)" | $ECM 11e3; checkcode $? 1 +echo "primu(0,-1,17)" | $ECM 11e3; checkcode $? 1 +echo "primu(2,4,17)" | $ECM 11e3; checkcode $? 1 +echo "primu(6,9,17)" | $ECM 11e3; checkcode $? 1 + +# exercise "base %Zd not supported for Aurifeullian factorization yet" error message +echo "phil(84,14)" | $ECM 11e3; checkcode $? 1 +echo "phil(39,13)" | $ECM 11e3; checkcode $? 1 +echo "phil(45,-15)" | $ECM 11e3; checkcode $? 1 + +# exercise "exponent %Zd does not make sense for base %Zd" error message +echo "phil(256,2)" | $ECM 11e3; checkcode $? 1 +echo "phim(166,2)" | $ECM 11e3; checkcode $? 1 +echo "phim(30,5)" | $ECM 11e3; checkcode $? 1 +echo "phil(30,10)" | $ECM 11e3; checkcode $? 1 # exercise "Error - invalid number" error message echo ")" | $ECM 11e3; checkcode $? 
1 @@ -227,19 +265,28 @@ echo "2^1123-1" | $ECM -param 0 -pm1 11000; checkcode $? 1 # exercise "Could not open file for reading" error message -$ECM -bloads sfile.txt 1e6 < ${GMPECM_DATADIR}/c155; checkcode $? 1 +$ECM -param 1 -bloads sfile.txt 1e6 < ${GMPECM_DATADIR}/c155; checkcode $? 1 +$ECM -param 1 -bloads sfile.txt 1e6 < ${GMPECM_DATADIR}/c155 2>&1 | grep "Could not open file sfile.txt"; checkcode $? 0 # exercise "read_s_from_file: 0 bytes read from " error message -$ECM -bloads ${GMPECM_DATADIR}/test_dummy2.save 1e6 < ${GMPECM_DATADIR}/c155; checkcode $? 1 +printf "\r\n\r\n# this is a comment line and should be ignored\r\n" > test_dummy2.save +$ECM -param 1 -bloads test_dummy2.save 1e6 < ${GMPECM_DATADIR}/c155; checkcode $? 1 +$ECM -param 1 -bloads test_dummy2.save 1e6 < ${GMPECM_DATADIR}/c155 2>&1 | grep "0 bytes read from test_dummy2.save"; checkcode $? 0 +/bin/rm -f test_dummy2.save # exercise batch error messages... -$ECM -bsaves test.ecm.s 11e3 < ${GMPECM_DATADIR}/c155 -$ECM -bloads test.ecm.s 1000 < ${GMPECM_DATADIR}/c155; checkcode $? 1 -$ECM -bloads test.ecm.s 10900 < ${GMPECM_DATADIR}/c155; checkcode $? 1 -/bin/rm -f test.ecm.s +TEST=test.ecm.s$$ +$ECM -bsaves $TEST 11e3 < ${GMPECM_DATADIR}/c155 +$ECM -bloads $TEST 1000 < ${GMPECM_DATADIR}/c155; checkcode $? 1 +$ECM -bloads $TEST 10900 < ${GMPECM_DATADIR}/c155; checkcode $? 1 +/bin/rm -f $TEST -# exercise "Error, -bsaves/-bloads makes sense in batch mode only" error message +# exercise "Error, -bsaves makes sense in batch mode only" error message $ECM -bsaves test.s -param 0 11e3 < ${GMPECM_DATADIR}/c155; checkcode $? 1 +# exercise "Error, -bloads makes sense in batch mode only" error message +$ECM -bsaves test.s -param 1 11e3 < ${GMPECM_DATADIR}/c155 +$ECM -bloads test.s -param 0 11e3 < ${GMPECM_DATADIR}/c155; checkcode $? 1 +/bin/rm -f test.s # exercise "Error, invalid parametrization." error message $ECM -param 5-3 11e3 < ${GMPECM_DATADIR}/c155; checkcode $? 
1 @@ -301,84 +348,73 @@ echo 17061648125571273329563156588435816942778260706938821014533 | $ECM -param 0 -sigma 585928442 174000; checkcode $? 14 # test -save/-resume -/bin/rm -f test.ecm.save -echo 17061648125571273329563156588435816942778260706938821014533 | $ECM -save test.ecm.save -param 0 -sigma 585928442 174000 0 -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 14 +TEST=test.ecm.save$$ +/bin/rm -f $TEST +echo 17061648125571273329563156588435816942778260706938821014533 | $ECM -save $TEST -param 0 -sigma 585928442 174000 0 +$ECM -resume $TEST 174000 85880350 +checkcode $? 14 -# test savea -cp ${GMPECM_DATADIR}/test_dummy2.save test.ecm.save -echo 17061648125571273329563156588435816942778260706938821014533 | $ECM -savea test.ecm.save -param 0 -sigma 585928442 174000 0 -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 14 +# test savea with existing file +printf "\r\n\r\n# this is a comment line and should be ignored\r\n" > $TEST +echo 17061648125571273329563156588435816942778260706938821014533 | $ECM -savea $TEST -param 0 -sigma 585928442 174000 0 +$ECM -resume $TEST 174000 85880350 +checkcode $? 14 # test unknown method -echo "METHOD=FOO" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 0 +echo "METHOD=FOO" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 0 # test invalid checksum -echo "CHECKSUM=xxx" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 0 +echo "CHECKSUM=xxx" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 0 # test comment -echo "METHOD=P-1;X=1;N=17;B1=2;COMMENT=this is a comment;" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 8 +echo "METHOD=P-1;X=1;N=17;B1=2;COMMENT=this is a comment;" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 8 # test invalid param -echo "PARAM=xxx" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? 
-checkcode $C 0 +echo "PARAM=xxx" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 0 # test invalid etype -echo "ETYPE=xxx" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 0 +echo "ETYPE=xxx" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 0 # test invalid B1 -echo "B1=xxx" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 0 +echo "B1=xxx" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 0 # test unknown tag -echo "FOO=xxx" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 0 +echo "FOO=xxx" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 0 # test semicolon after tags -echo "METHOD=P-1;X=1;N=17;B1=2" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 0 +echo "METHOD=P-1;X=1;N=17;B1=2" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 0 # test QX tag without sigma -echo "METHOD=ECM;QX=1;N=17;B1=2;" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 0 +echo "METHOD=ECM;QX=1;N=17;B1=2;" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 0 # test missing fields -echo "METHOD=ECM;QX=1;N=17;" > test.ecm.save -$ECM -resume test.ecm.save 174000 85880350 -C=$? -checkcode $C 0 +echo "METHOD=ECM;QX=1;N=17;" > $TEST +$ECM -resume $TEST 174000 85880350 +checkcode $? 0 -/bin/rm -f test.ecm.save -echo 17061648125571273329563156588435816942778260706938821014533 | $ECM -save test.ecm.save -A 22/7 -x0 1/3 -y0 2/7 1e3 -$ECM -resume test.ecm.save 1e3 +/bin/rm -f $TEST +echo 17061648125571273329563156588435816942778260706938821014533 | $ECM -save $TEST -A 22/7 -x0 1/3 -y0 2/7 1e3 +$ECM -resume $TEST 1e3 C=$? -/bin/rm -f test.ecm.save +/bin/rm -f $TEST checkcode $C 0 $ECM -resume ${GMPECM_DATADIR}/test_dummy.save 1e3; checkcode $? 0 @@ -418,7 +454,8 @@ # in mulredc*.asm. 
This input has 1363 bits so it has 22 64 bit words # (43 32 bit words) and cannot use mulredc which handles only up to 20 limbs -echo "10090030271*10^400+696212088699" | $ECM -param 0 -sigma 3923937547 1e3 1e6; checkcode $? 14 +# Test APR primality test +echo "10090030271*(10^400+69)" | $ECM -param 0 -sigma 3923937547 1e3 1e6; checkcode $? 14 echo 31622776601683791911 | $ECM -sigma 0:249908706013996416 11000; checkcode $? 0 @@ -440,11 +477,12 @@ checkcode $? 8 # test -bsaves/-bloads -echo 18446744073709551557 | $ECM -param 1 -A 1 -bsaves test.ecm.s 11000 -echo 18446744073709551557 | $ECM -param 1 -A 312656731337392125 -bloads test.ecm.s 11000; checkcode $? 8 +TEST=test.ecm.s$$ +echo 18446744073709551557 | $ECM -param 1 -A 1 -bsaves $TEST 11000 +echo 18446744073709551557 | $ECM -param 1 -A 312656731337392125 -bloads $TEST 11000; checkcode $? 8 # The following test (with -v and -bloads) would crash on Windows prior to svn 2968. -echo 18446744073709551557 | $ECM -param 1 -A 312656731337392125 -bloads test.ecm.s -v 11000; checkcode $? 8 -/bin/rm -f test.ecm.s +echo 18446744073709551557 | $ECM -param 1 -A 312656731337392125 -bloads $TEST -v 11000; checkcode $? 8 +/bin/rm -f $TEST # non-regression test for bug fixed by changeset r1819 on 32-bit echo 4294967291 | $ECM -param 1 -A 17 1000 @@ -498,15 +536,55 @@ echo "Phi(101,30)" | $ECM -sigma 0:12023436370081639188 1e5 checkcode $? 14 +echo "Phi(202,-30)" | $ECM -sigma 0:12023436370081639188 1e5 +checkcode $? 14 + echo "1+Phi(102,1)" | $ECM -sigma 0:12023436370081639188 1e5 checkcode $? 8 echo "Phi(101,1)" | $ECM -sigma 0:12023436370081639188 1e5 checkcode $? 8 +echo "Phi(101^3,1)" | $ECM -sigma 0:12023436370081639188 1e5 +checkcode $? 8 + echo "17+Phi(1,2)" | $ECM -sigma 0:17 1e3 checkcode $? 6 +if [ "$MUL" != "redc" ]; then + +# tests to exercise gcd code in eval.c +echo "gcd(Phi(44,968),968^11-44*968^5+1)" | $ECM -sigma 1:3751794696 2e3 +checkcode $?
14 + +# tests to exercise the Aurifeullian code in eval.c +echo "PhiL(44,968)" | $ECM -sigma 1:3751794696 2e3 +checkcode $? 14 + +echo "PhiM(630,3)" | $ECM -sigma 1:1931892209 2e3 +checkcode $? 6 + +echo "PhiL(525,5)" | $ECM -sigma 1:2261023611 200 +checkcode $? 14 + +echo "PhiM(180,150)" | $ECM -sigma 1:3327014026 11e3 +checkcode $? 10 + +echo "PhiL(350,7)/3851" | $ECM -sigma 1:283849997 11e3 +checkcode $? 14 + +echo "PhiM(140,10)" | $ECM -sigma 1:500092364 11e3 +checkcode $? 10 + +echo "PhiL(242,44)" | $ECM -sigma 1:1028204016 11e3 +checkcode $? 10 + +# tests to exercise U and primU code in eval.c +echo "primU(5,-1,160)" | $ECM -sigma 1:450587089 11e3 +checkcode $? 10 + +fi + # on systems with 64-bit limbs, exercise mulredc9 echo "2^567-181" | $ECM -sigma 0:2521899833399249862 11000; checkcode $? 14 # on systems with 64-bit limbs, exercise mulredc10 @@ -574,12 +652,14 @@ # exercise memory leak echo 2050449353925555290706354283 | $ECM -param 0 -sigma 7 30 308 -##### tests for the Weierstrass form +##### tests for Weiestrass forms ##### (disabled in revision 2765 since not tested enough) -if [ 0 -eq 1 ]; then +if [ 1 -eq 1 ]; then # factored with Step 1 echo 2432902008176640001 | $ECM -param 5 -A 1 -x0 20 -y0 10 100000; checkcode $? 14 +# missed using a 2-torsion point +echo 2432902008176640001 | $ECM -param 5 -A 1 -x0 1411930621319333529 -y0 0 100000; checkcode $? 8 # unfactored echo 2432902008176640001 | $ECM -param 5 -A 1 -x0 20/3 -y0 10 10000 20000; checkcode $? 0 # factored with default Step 2 @@ -602,6 +682,11 @@ echo 2432902008176640001 | $ECM -param 5 -A 1/20639383 -x0 20 -y0 10 10000; checkcode $? 14 echo 2432902008176640001 | $ECM -param 5 -A 1 -x0 20/20639383 -y0 10 10000; checkcode $? 14 echo 2432902008176640001 | $ECM -param 5 -A 1/2 -x0 20 -y0 10/20639383 10000; checkcode $? 
14 +fi # tests for Weierstrass form + +##### tests for Hessian forms +if [ 1 -eq 1 ]; then + # Hessian form: X^3+Y^3+Z^3=3*D*X*Y*Z (torsion group Z3xZ3 over Q(sqrt(-3))) # found in Step 1 echo 2432902008176640001 | $ECM -param 6 -A 2 -x0 2 -y0 3 9000; checkcode $? 14 @@ -609,36 +694,105 @@ echo 2432902008176640001 | $ECM -param 6 -A 2 -x0 2 -y0 3 100 9000; checkcode $? 14 # no factor found echo 2432902008176640001 | $ECM -param 6 -A 2 -x0 2 -y0 3 1000 2000; checkcode $? 0 +# JKL-ECM in twisted Hessian form +# that is a*X^3+Y^3+Z^3 = d*X*Y*Z +echo 2432902008176640001 | $ECM -param 7 -A 125/499 -x0 19/485 -y0 5/97 1e5; checkcode $? 14 +echo 2432902008176640001 | $ECM -param 7 -A 125/123 -x0 2 -y0 13 1e5; checkcode $? 14 +# found in Step 2 +echo 2432902008176640001 | $ECM -param 7 -A 125/123 -x0 2 -y0 13 1e4 1e5; checkcode $? 14 -fi # tests for Weierstrass form +fi # tests for Hessian form ##### tests for curves with specific torsion ##### (disabled in revision 2764 since not tested enough) -if [ 0 -eq 1 ]; then +if [ 1 -eq 1 ]; then +##### Z5 echo 4722366482800925736961 | $ECM -torsion Z5 -sigma 2 1e5; checkcode $? 14 ## a factor is to be found during initialization echo 12787261 | $ECM -torsion Z5 -sigma 1000 1e2; checkcode $? 14 ## a factor is to be found during initialization in cubic_to_quartic +##### Z7 echo 123041 | $ECM -torsion Z7 -sigma 2 1e2; checkcode $? 14 +## found factor during init of Q in Z7 +echo 123041 | $ECM -torsion Z7 -sigma 10 1e2; checkcode $? 14 ## found factor during update of Q in Z7 echo 376171002633197 | $ECM -torsion Z7 -sigma 5 1e2; checkcode $? 14 +## found factor during update of Q in Z7 +echo 376171002633197 | $ECM -torsion Z7 -sigma -5 1e2; checkcode $? 14 ## a factor is found echo 13290059 | $ECM -torsion Z7 -sigma 4 1e2; checkcode $? 14 ## in step 2 echo 2432902008176640001 | $ECM -torsion Z7 -sigma 1 1e3 1e8; checkcode $? 
14 ## -save/-resume -echo 2432902008176640001 | $ECM -torsion Z7 -save test.ecm.save -sigma 1 1e3 -$ECM -resume test.ecm.save 1e3 1e8 +TEST=test.ecm.save$$ +echo 2432902008176640001 | $ECM -torsion Z7 -save $TEST -sigma 1 1e3 +$ECM -resume $TEST 1e3 1e8 C=$? -/bin/rm -f test.ecm.save +/bin/rm -f $TEST checkcode $C 14 +##### Z9 +## found factor during init of Q in Z9 +echo 874700000026241 | $ECM -torsion Z9 -sigma 10 1e2; checkcode $? 14 ## found factor during update of Q in Z9 echo 874700000026241 | $ECM -torsion Z9 -sigma 7 1e2; checkcode $? 14 ## found factor in Step 1 echo 2432902008176640001 | $ECM -torsion Z9 -sigma 3 13000; checkcode $? 14 +## found factor in Z9 (cubic_2_quartic) +echo 1007 | $ECM -torsion Z9 -sigma 7 1e2; checkcode $? 14 +##### Z10 +## found factor in Step 1 echo 2432902008176640001 | $ECM -torsion Z10 -sigma 3 320; checkcode $? 14 +## skipping u=2 +echo 2432902008176640001 | $ECM -torsion Z10 -sigma 2 320; checkcode $? 14 +## found factor during update of Q in Z10 +echo 871 | $ECM -torsion Z10 -sigma 9 1e2; checkcode $? 14 +## found factor in Z10 (cubic_2_quartic) +echo 1703 | $ECM -torsion Z10 -sigma 7 1e2; checkcode $? 14 +## inverse found in Z10 (d) +echo 122473 | $ECM -torsion Z10 -sigma 7 1e2; checkcode $? 14 +##### Z2xZ8 echo 2432902008176640001 | $ECM -torsion Z2xZ8 -sigma 2 1300; checkcode $? 14 +# found factor during init of Q in Z2xZ8 +echo 923 | $ECM -torsion Z2xZ8 -sigma 10 1e2; checkcode $? 14 +# found factor in Z2xZ8 (update of Q) +echo 923 | $ECM -torsion Z2xZ8 -sigma 7 1e2; checkcode $? 14 +# found factor in Z2xZ8 (beta) +echo 1963 | $ECM -torsion Z2xZ8 -sigma 7 1e2; checkcode $? 14 +# found factor in Z2xZ8 (d) +echo 533 | $ECM -torsion Z2xZ8 -sigma 7 1e2; checkcode $? 14 +# found factor in Z2xZ8 (d2) +echo 169 | $ECM -torsion Z2xZ8 -sigma 7 1e2; checkcode $? 14 +# found factor in Z2xZ8 (mb) +echo 5776889 | $ECM -torsion Z2xZ8 -sigma 7 1e2; checkcode $? 
14 +# found factor in Z2xZ8 (alpha) +echo 3299173 | $ECM -torsion Z2xZ8 -sigma 8 1e2; checkcode $? 14 +##### Z3xZ3 +## abnormal case +echo 2432902008176640001 | $ECM -torsion Z3xZ3 -sigma 2 5; checkcode $? 8 +## found factor in Step 1 +echo 2432902008176640001 | $ECM -torsion Z3xZ3 -sigma 3 1e5; checkcode $? 14 +echo 2432902008176640001 | $ECM -torsion Z3xZ3 -sigma -3 1e5; checkcode $? 14 +## found factor in Z3xZ3 (D) +echo 171523 | $ECM -torsion Z3xZ3 -sigma 11 1e5; checkcode $? 6 +## D^3=1 => singluar curve +echo 217 | $ECM -torsion Z3xZ3 -sigma 11 1e5; checkcode $? 1 +##### Z3xZ6 +## found factor in Step 1 +echo 2432902008176640001 | $ECM -torsion Z3xZ6 -sigma 11 1e3; checkcode $? 14 +## found factor in Z3xZ6 (update of Q) +echo 101303039 | $ECM -torsion Z3xZ6 -sigma 1014 1e5; checkcode $? 14 +## found factor in Z3xZ6 (D) +echo 115 | $ECM -torsion Z3xZ6 -sigma 10 1e5; checkcode $? 14 +##### Z4xZ4 +## found factor in Step 1 +echo 1022117 | $ECM -torsion Z4xZ4 -sigma 8 5; checkcode $? 14 +# Factor found during init of Z4xZ4 (mb) +echo 169 | $ECM -torsion Z4xZ4 -sigma 8 1e2; checkcode $? 14 +# Factor found during init of Z4xZ4 (tau) +echo 115 | $ECM -torsion Z4xZ4 -sigma 10 1e2; checkcode $? 14 +## error on torsion group +echo 2432902008176640001 | $ECM -torsion ZZ -sigma 2 1300; checkcode $? 1 fi # tests with -torsion diff -Nru gmp-ecm-7.0.4+ds/test.gpuecm gmp-ecm-7.0.5+ds/test.gpuecm --- gmp-ecm-7.0.4+ds/test.gpuecm 2016-07-25 14:11:06.000000000 +0000 +++ gmp-ecm-7.0.5+ds/test.gpuecm 2022-06-06 14:16:49.000000000 +0000 @@ -21,13 +21,12 @@ # http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., # 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. -ECM="${1:-./ecm} -gpu -gpucurves 1" -ECMnoGPU="${1-./ecm}" +ECM="${1:-./ecm} -gpu -gpucurves 32" +ECMnoGPU=$(echo "${1-./ecm}" | sed -e "s/-gpu//" -e "s/-cgbn//") GMPECM_DATADIR=${GMPECM_DATADIR:-.} # Call with "checkcode $? 
n" to check that return code is n -# (see test.pm1 for the explanation of the different return codes) checkcode () { if [ $1 != $2 ] then @@ -46,6 +45,33 @@ # 10 Composite factor found, cofactor is a probable prime # 14 Probable prime factor found, cofactor is a probable prime +# Call with "checkfactorcount $FILE $step1 $step2 $unique" to check that +# step 1 finds $step1 factors and step 2 finds $step2 factors. +# and that $unique factors are found +checkfactorcount () { + step1found=$(grep -c 'factor [0-9]* found in Step 1' < $1) + step2found=$(grep -c 'factor [0-9]* found in Step 2' < $1) + uniquefound=$(grep -c 'Factor found in step' < $1) + if [ $step1found != $2 ] + then + echo "################### ERROR ###################" + echo "Expected to find $2 factors in stage 1 found $step1found" + exit 1 + fi + if [ $step2found != $3 ] + then + echo "################### ERROR ###################" + echo "Expected to find $3 factors in stage 2 found $step2found" + exit 1 + fi + if [ $uniquefound != $4 ] + then + echo "################### ERROR ###################" + echo "Expected to find $4 unique factors found $uniquefound" + exit 1 + fi +} + # test for stage 1 on GPU echo 458903930815802071188998938170281707063809443792768383215233 | $ECM -sigma 3:227 125 0 checkcode $? 14 @@ -118,7 +144,7 @@ # bug found and patched by Greg Childers: resuming from gpu generated save file didn't work -# Details: https://lists.gforge.inria.fr/pipermail/ecm-discuss/2016-April/004348.html +# Details: https://sympa.inria.fr/sympa/arc/ecm-discuss/2016-04/msg00012.html # fixed with svn 2915. several test cases given to exercise patch: /bin/rm -f test.ecm.save # Save file with no factor found in step 1 or step 2 @@ -164,32 +190,60 @@ checkcode $? 
6 /bin/rm -f test.ecm.save -echo "" > test_dummy2.save -echo '\r' >> test_dummy2.save -echo '\r# this is a comment line and should be ignored' >> test_dummy2.save +printf "\r\n\r\n# this is a comment line and should be ignored\r\n" > test_dummy2.save # test -save/-resume /bin/rm -f test.ecm.save echo 17061648125571273329563156588435816942778260706938821014533 | $ECM -save test.ecm.save -param 3 -sigma 3781882524 9000 0 $ECMnoGPU -resume test.ecm.save 9000 1000000 -C=$? -checkcode $C 14 +checkcode $? 14 # test savea cp test_dummy2.save test.ecm.save echo 17061648125571273329563156588435816942778260706938821014533 | $ECM -savea test.ecm.save -param 3 -sigma 3781882524 9000 0 $ECMnoGPU -resume test.ecm.save 9000 1000000 -C=$? -checkcode $C 14 +checkcode $? 14 -/bin/rm test_dummy2.save test.ecm.save +# verify gpu vs cpu save +/bin/rm -f test.ecm.save test.ecm2.save +for param in `seq 1000 1031` +do + echo "2^293-1" | $ECMnoGPU -q -savea test.ecm.save -sigma 3:$param 1e3 0 > test.output + checkcode $? 0 +done +echo "2^293-1" | $ECM -savea test.ecm2.save -sigma 3:1000 1e3 0 +# truncate some trailing fields +sed -i 's/ PROGRAM.*//' test.ecm.save test.ecm2.save +diff test.ecm.save test.ecm2.save +checkcode $? 0 # find multiple factors in Step 1 -echo "(2^718+1)/5" | $ECM -sigma 3:2000 50 60 +echo "(2^718+1)/5" | $ECM -sigma 3:2000 50 60 > test.output checkcode $? 2 +checkfactorcount test.output 2 0 2 # find multiple factors in Step 2 -echo "(2^718+1)/5" | $ECM -sigma 3:2000 40 60 +echo "(2^718+1)/5" | $ECM -sigma 3:2000 40 60 > test.output +checkcode $? 2 +checkfactorcount test.output 0 3 2 + +# find multiple identical factors in Step 1 +echo "2^139-1" | $ECM -sigma 3:1600 7000 0 > test.output +checkcode $? 14 +checkfactorcount test.output 4 0 1 + +# find multiple identical factors in Step 2 +echo "2^139-1" | $ECM -sigma 3:1800 1900 8000 > test.output +checkcode $? 
14 +checkfactorcount test.output 0 3 1 + +# test finding multiple overlapping factors in Step 1 +echo "101^2*1013*1000003*1000033*1000037*1000039*1000081^2*(2^89-1)^2" | $ECM -gpucurves 1024 -sigma 3:1000 10000 > test.output checkcode $? 2 +# 101^2*1013 and 1000081^2 are always found together as composite factor +checkfactorcount test.output 1024 0 6 + +/bin/rm test_dummy2.save test.ecm.save test.ecm2.save test.output +echo "$ECM" echo "All ECM tests with GPU are ok." diff -Nru gmp-ecm-7.0.4+ds/testlong.ecm gmp-ecm-7.0.5+ds/testlong.ecm --- gmp-ecm-7.0.4+ds/testlong.ecm 2016-06-27 11:59:23.000000000 +0000 +++ gmp-ecm-7.0.5+ds/testlong.ecm 2022-06-06 14:16:49.000000000 +0000 @@ -55,9 +55,10 @@ echo "(2^23209+1)/389100550245753" | $ECM -param 0 -sigma 1403722985 -go "1043173*1363273*1577143" 29411 1046754883; checkcode $? 6 # from https://groups.google.com/forum/#!topic/Mersenneplustwo/XKAkqQ3_ni4 -# this example is too expensive, we skip it +# skip this very expensive example (takes 40+ minutes) +# echo "(2^32582657+1)/3" | $ECM -param 0 -sigma 1483035008190041 4423 481199 -# check that primality test of cofactor is not too expensive +# check that primality test of cofactor(777450 digits) is not too expensive echo "2^2582657+1" | $ECM -param 0 -sigma 1483035008190041 46 481199; checkcode $? 
2 # p54 factor from F_12 found on March 27, 2010 by Michael Vang diff -Nru gmp-ecm-7.0.4+ds/test_M877.save gmp-ecm-7.0.5+ds/test_M877.save --- gmp-ecm-7.0.4+ds/test_M877.save 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/test_M877.save 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1 @@ +N=0x2BA2CB2D2B2B53F996E46EB8A8A77F99BD1FF5C7DDE2C59A371A0B3BAF2337D599C4EF67BAE800CC819B18B3247A51CC43AA8C5280D8D3B2C2E329990409761164A109DB965B7C986395967BFCAA52930DC46137E82CA3BB57AAA7FFF7432EC7E4D93EB319ACB2CEA29; QX=0x165D71C162D3B11A4AD773866FDBFEEFC1CFBAE3E52EA13B3DCAF6D9F6976A7B371E86180EDC8A230DA629111B61734843224DA6A0239CF839D48A47B4916F296768D57E23BCEE723081D4F781B4C28AFEED24A9F3A2C9D79BF9CA4F04D35AAF3F2CB41A2360656AA90; SIGMA=4985522057933944 diff -Nru gmp-ecm-7.0.4+ds/test.pm1 gmp-ecm-7.0.5+ds/test.pm1 --- gmp-ecm-7.0.4+ds/test.pm1 2016-08-29 06:27:01.000000000 +0000 +++ gmp-ecm-7.0.5+ds/test.pm1 2022-06-06 14:16:49.000000000 +0000 @@ -73,7 +73,7 @@ # test with -go echo 563796628294674772855559264041716715663 | $PM1 -go 4031563 67801 14334623; checkcode $? 8 -# http://lists.gforge.inria.fr/pipermail/ecm-discuss/2013-March/004214.html +# https://sympa.inria.fr/sympa/arc/ecm-discuss/2013-03/msg00000.html echo 563796628294674772855559264041716715663 | $PM1 -go 39331109600487907694228112175794 1 14334623; checkcode $? 8 echo 188879386195169498836498369376071664143 | $PM1 3026227 99836987; checkcode $? 8 @@ -85,27 +85,29 @@ echo 2124306045220073929294177 | $PM1 290021 1193749003; checkcode $? 8 ### Try saving and resuming -echo 25591172394760497166702530699464321 | $PM1 -save test.pm1.save 100000 +TEST=test.pm1.save$$ +echo 25591172394760497166702530699464321 | $PM1 -save $TEST 100000 checkcode $? 0 -$PM1 -resume test.pm1.save 120557 2007301 +$PM1 -resume $TEST 120557 2007301 C=$? 
-/bin/rm -f test.pm1.save +/bin/rm -f $TEST checkcode $C 8 # check the -chkpnt option -echo 25591172394760497166702530699464321 | $PM1 -chkpnt test.pm1.chk 100000 +TEST=test.pm1.chk$$ +echo 25591172394760497166702530699464321 | $PM1 -chkpnt $TEST 100000 checkcode $? 0 -$PM1 -resume test.pm1.chk 120557 2007301 +$PM1 -resume $TEST 120557 2007301 C=$? -/bin/rm -f test.pm1.chk +/bin/rm -f $TEST checkcode $C 8 ### same with -savea -echo 25591172394760497166702530699464321 | $PM1 -savea test.pm1.save 100000 +echo 25591172394760497166702530699464321 | $PM1 -savea $TEST 100000 checkcode $? 0 -$PM1 -resume test.pm1.save 120557 2007301 +$PM1 -resume $TEST 120557 2007301 C=$? -/bin/rm -f test.pm1.save +/bin/rm -f $TEST checkcode $C 8 # bug in ecm-5.0 (overflow in fin_diff_coeff) diff -Nru gmp-ecm-7.0.4+ds/test.pp1 gmp-ecm-7.0.5+ds/test.pp1 --- gmp-ecm-7.0.4+ds/test.pp1 2016-08-29 06:28:03.000000000 +0000 +++ gmp-ecm-7.0.5+ds/test.pp1 2022-06-06 14:16:49.000000000 +0000 @@ -60,10 +60,6 @@ echo 6215074747201 | $PP1 -x0 5 630 199729; checkcode $? 8 -# bug in 6.1.3 -echo 6215074747201 | $PP1 -power 2 -x0 5 630 199729; checkcode $? 8 -echo 6215074747201 | $PP1 -dickson 3 -x0 5 630 199729; checkcode $? 8 - echo 8857714771093 | $PP1 -x0 3 23251 49207; checkcode $? 8 echo 236344687097 | $PP1 -x0 3 619 55001; checkcode $? 8 @@ -104,20 +100,21 @@ checkcode $? 14 # test -save/-resume -echo 2277189375098448170118558775447117254551111605543304035536750762506158547102293199086726265869065639109 | $PP1 -x0 3 -save test.pp1.save 1000000 0 +TEST=test.pp1.save$$ +echo 2277189375098448170118558775447117254551111605543304035536750762506158547102293199086726265869065639109 | $PP1 -x0 3 -save $TEST 1000000 0 checkcode $? 0 -$PP1 -resume test.pp1.save 2337233 132554351 +$PP1 -resume $TEST 2337233 132554351 C=$? 
+/bin/rm -f $TEST checkcode $C 14 -/bin/rm -f test.pp1.save # test -chkpnt -echo "chkpnt" -echo 2277189375098448170118558775447117254551111605543304035536750762506158547102293199086726265869065639109 | $PP1 -x0 3 -chkpnt test.pp1.chk 1000000 0 +TEST=test.pp1.chk$$ +echo 2277189375098448170118558775447117254551111605543304035536750762506158547102293199086726265869065639109 | $PP1 -x0 3 -chkpnt $TEST 1000000 0 checkcode $? 0 -$PP1 -resume test.pp1.chk 2337233 132554351 +$PP1 -resume $TEST 2337233 132554351 C=$? -/bin/rm -f test.pp1.chk +/bin/rm -f $TEST checkcode $C 14 # bug in ecm-5.0 (overflow in fin_diff_coeff) diff -Nru gmp-ecm-7.0.4+ds/TODO gmp-ecm-7.0.5+ds/TODO --- gmp-ecm-7.0.4+ds/TODO 2013-02-22 14:00:09.000000000 +0000 +++ gmp-ecm-7.0.5+ds/TODO 2022-06-06 14:16:49.000000000 +0000 @@ -81,12 +81,12 @@ thesis (request of Peter-Lawrence Montgomery) - Torbj"orn Granlund suggested faster code for mpn_mod_1(), used extensively in NTT. See - http://lists.gforge.inria.fr/pipermail/ecm-discuss/2008-May/003365.html + https://sympa.inria.fr/sympa/arc/ecm-discuss/2008-05/msg00000.html 2) interface - from Mark Rodenkirch 08 April 2011: print messages like "Step 1: 1500000/100000000" with a command-line option (or with -v) - http://lists.gforge.inria.fr/pipermail/ecm-discuss/2011-April/004088.html + https://sympa.inria.fr/sympa/arc/ecm-discuss/2011-04/msg00000.html - with -resume, print %time for THIS RUN instead of total run? [suggested by SleepHound ] Add CPUTIME=... in the save file, to take into account the total cpu time @@ -111,7 +111,6 @@ 5) bugs 6) others -- add primality proving of factors/cofactors? Maybe link Pari for this? - add point counting algorithm? SEA implementation exists for Pari/GP, use that? 
- let user specify previous factoring work, compute distribution of diff -Nru gmp-ecm-7.0.4+ds/TODO.gpu gmp-ecm-7.0.5+ds/TODO.gpu --- gmp-ecm-7.0.4+ds/TODO.gpu 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/TODO.gpu 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,45 @@ +Last Reviewed: Seth T. 2021-10-14 + +Quality of Life + +1. Verify kernel_info.maxThreadsPerBlock <= TPB_DEFAULT in BIT select loop. +2. Add info on what GPU kernels are available. + +Big improvements + +1. Automatic tuning for TPB, gpucurves + gpu_throughput_test could be automated in the code and run when B1 > THRESHOLD + Getting the wrong gpucurves / TPB pair can cost 2x/4x performance. + +2. Flag to sleep between kernel calls + This reduces performance but can help the responsiveness of the computer. + Even 5ms of pause (~7% performance) made the system seem much more stable. + +3. Some benchmark (gpu_throughput_test.sh?) to prevent performance regressions. + Ideally it would record + +Testing + +1. Improve carry bit testing (see overflow test in check_gpuecm.sage) +2. Ask a C++ person and verify that GPU and CPU both use same endianness + This could affect `to_mpz`, `from_mpz`, `allocate_and_set_s_bits`, and `set_p_2p` + This is possibly handled by `endian` parameter in mpz_export + +Several things have been tried to improve performance that didn't turn out. These are recorded +so we don't forget. + +1. Branchless + if(carry) { cgbn_sub(r, r, modulus) } + can be replaced with + cgbn_sub(r, r, zero_or_modulus[carry]); + In theory this should help the different threads all stay aligned, in practice it didn't + +2. Reducing number of reserved carry bits + CARRY_BITS can be reduced from 6 to 2(?) by checking carry bit in addition to overflow + after cgbn_add. This increases the size of number that can be run in cgbn_kernel. + There's some performance penalty for this so the code is uncommitted, but this is a big + improvement for numbers 506-510 and 1018-1022 bits. + +3.
Removing the bn_t variables CB,DA,AA,BB,k,dK + Only 2 (or possibly 3) temporary variables are needed. This change didn't impact performance + and hurt readability so was backed out. It's always possible that it could reduce registers diff -Nru gmp-ecm-7.0.4+ds/TODO.sp gmp-ecm-7.0.5+ds/TODO.sp --- gmp-ecm-7.0.4+ds/TODO.sp 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/TODO.sp 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,55 @@ +TODOs: + +General: + +1) Check for malloc errors. + +2) grep FIXME *.c *.h + +asm level: + +All the SP code is *heavily* dependent on the C compiler. The performance gap +between unoptimized and -O3 code is huge. Maybe a couple of asm routines would +be appropriate if some kind soul were willing to write them. + +1) spv_dotproduct. Given spvs a and b, compute + a[0] * b[0] + ... a[n-1] * b[n-1] mod p + + Useful in mpzspp_normalize. + +2) spv_ntt_gfp_dif, spv_ntt_gfp_dit. Most of stage 2 is spent on these + functions; maybe a hand-written version will outperform gcc's efforts. + + +SP level: + +1) Maybe use 64->32 bit Montgomery reduction instead of GMP's + udiv_qrnnd_preinv2norm. Preliminary tests indicate this gives a 10% speedup + in ntt_gfp_dif. See sp_montmul in sp.h. + + UPDATE: This code has been written (but is not in cvs). It turns out that + sp_montmul as presented is incorrect as we have to check for + overflows. Unfortunately this cancels out the 10% speedup. + +SPV level: + +1) toom3 and toom4 have been translated directly from toomcook.c but seem to be + very slow. Try lowering the operation count (i.e. increase the number of + muls) and see if we can fit a toom3/4 in-between karatsuba and ntt. + +4) Try out the NTT over GF(p^2). Old code doing this exists (DN), I'll see if + I can get it to work at some point. This is why the function is called + spv_ntt_gfp_dif, not just spv_ntt_dif. + +5) Split-radix NTT. Old code doing this exists (DN), etc...
I read somewhere + that "A well-implemented radix-2 NTT can outperform a badly-implemented + split-radix NTT" + +MPZSPP level: + +1) Find a better way of doing mpzspp_normalize. + +2) Use a division tree for mpzspp_set_mpzp. Note however that these aren't + too time-critical if we rewrite the high-level poly functions (see below) + so they don't convert to/from mpzspps for every poly mul. + diff -Nru gmp-ecm-7.0.4+ds/torsions.c gmp-ecm-7.0.5+ds/torsions.c --- gmp-ecm-7.0.4+ds/torsions.c 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/torsions.c 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,1417 @@ +/* torsions.c - ECM with special torsion curves + Author: F. Morain +*/ + +#include +#include +#include +#include + +#include /* GMP header file */ + +#include "ecm.h" /* ecm header file */ +#include "ecm-impl.h" +#include "ecm-ecm.h" +#include "mpmod.h" + +#include "addlaws.h" +#include "torsions.h" + +#define DEBUG_TORSION 0 + +/* We use three variants of Weierstrass parametrization: + CW (complete): y^2+a1*x*y+a3*y=x^3+a2*x^2+a4*x+a6 + MW (medium) : y^2=x^3+a2*x^2+a4*x+a6 + SW (short) : y^2=x^3+a4*x+a6 + + A Kubert curve is the special case Y^2+(1-c)*X*Y-b*Y = X^3-b*X^2 + + Generally, we build a curve under the SW form, with affine law, meaning + that constructed points will be [x, y, 1]. + */ + +/********** utilities **********/ + +void +mod_div_2(mpz_t x, mpz_t n) +{ + if(mpz_tstbit(x, 0)){ + /* x is odd, x/2 = (x+N)/2 */ + mpz_add(x, x, n); + mpz_div_2exp(x, x, 1); + } + else + /* x is even, x/2 is easy */ + mpz_div_2exp(x, x, 1); +} + +/* r <- q mod N. + Return value: 1 if den invertible, 0 if factor found; in this case + gcd(den(q), N) is put in r. 
+ */ +int +mod_from_rat2(mpz_t r, mpz_t num, mpz_t den, mpz_t N) +{ + int ret = 1; + + if(mpz_invert(r, den, N) == 0){ + mpz_gcd(r, den, N); + ret = 0; + } + else{ + mpz_mul(r, r, num); + mpz_mod(r, r, N); + } + return ret; +} + +int +mod_from_rat_str(mpz_t r, char *str, mpz_t N) +{ + mpq_t q; + int ret; + + mpq_init(q); + mpq_set_str(q, str, 10); + ret = mod_from_rat2(r, mpq_numref(q), mpq_denref (q), N); + mpq_clear(q); + return ret; +} + +/* From a curve in Kubert form Y^2+(1-c)*X*Y-b*Y = X^3-b*X^2 + to a Weierstrass form y^2 = X^3 + a2 * X^2 + a4 * X + a6 + where y = Y+((1-c)*X-b)/2 + WE:=[0,(1/4*c^2+1/4-1/2*c-b),0,(1/2*c*b-1/2*b),1/4*b^2]); + We compute: + a2 = 1/4*c^2+1/4-1/2*c-b = ((c-1)/2)^2-b + a4 = 1/2*c*b-1/2*b = b*(c-1)/2 + a6 = (b/2)^2 + TODO: rewrite this with MediumW, etc. +*/ +void +KW2W246(mpz_t a2, mpz_t a4, mpz_t a6, mpz_t b, mpz_t c, mpz_t n, int compute_a6) +{ + /** a4 <- (c-1)/2 **/ + mpz_sub_si(a4, c, 1); + mod_div_2(a4, n); + /** a2 <- a4^2-b **/ + mpz_mul(a2, a4, a4); + mpz_sub(a2, a2, b); + mpz_mod(a2, a2, n); + /** a4 <- a4*b **/ + mpz_mul(a4, a4, b); + mpz_mod(a4, a4, n); + if(compute_a6 != 0){ + mpz_set(a6, b); + mod_div_2(a6, n); + mpz_mul(a6, a6, a6); + mpz_mod(a6, a6, n); + } +#if DEBUG_TORSION >= 2 + gmp_printf("N:=%Zd;\n", n); + gmp_printf("b:=%Zd;\n", b); + gmp_printf("c:=%Zd;\n", c); + gmp_printf("a2:=%Zd;\n", a2); + gmp_printf("a4:=%Zd;\n", a4); + printf("a6:=RatMod(b^2/4, N);\n"); + if(compute_a6 != 0) + gmp_printf("a6:=%Zd;\n", a6); +#endif +} + +static int +check_weierstrass(mpz_t A, mpz_t B, mpz_t X, mpz_t Y, mpz_t tmp1, mpz_t tmp2, + mpz_t n) +{ + mpz_mul(tmp1, Y, Y); + mpz_mul(tmp2, X, X); + mpz_add(tmp2, tmp2, A); + mpz_mul(tmp2, tmp2, X); + mpz_add(tmp2, tmp2, B); + mpz_sub(tmp1, tmp1, tmp2); + mpz_mod(tmp1, tmp1, n); + return mpz_sgn(tmp1) == 0; +} + +/* Weierstrass (a2, a4, a6) to (A, B) + A = (a4-1/3*a2^2) + B = -1/3*a4*a2 + 2/27*a2^3 + a6 + = -1/3*a2*(a4-2/9*a2^2) + a6 + X = x+a2/3 + Y = y + INPUT: if x0 == NULL, we 
have no point to translate + if B == NULL, we do not need and we do not compute B + REM: we assume gcd(n, 3) = 1. +*/ +void +MediumWeierstrassToShortWeierstrass(mpz_t A, mpz_t B, mpz_t X, mpz_t Y, + mpz_t a2, mpz_t a4, mpz_t a6, + mpz_t x0, mpz_t y0, mpz_t n) +{ + mpz_t tmp1, tmp2, tmp3; + + mpz_init(tmp1); + mpz_init(tmp2); + /* tmp2 <- a2/3 */ + mpz_init_set_si(tmp3, 3); + mod_from_rat2(tmp2, a2, tmp3, n); + if(X != NULL && x0 != NULL){ + /* wx0 = x0 + a2/3 */ + mpz_add(X, tmp2, x0); + mpz_mod(X, X, n); + } + if(Y != NULL && y0 != NULL){ + mpz_set(Y, y0); + mpz_mod(Y, Y, n); + } + /* A = a4-1/3*a2^2 = a4 - a2 * (a2/3) */ + /** tmp1 <- tmp2*a2 = a2^2/3 */ + mpz_mul(tmp1, a2, tmp2); + mpz_mod(tmp1, tmp1, n); + mpz_sub(A, a4, tmp1); + mpz_mod(A, A, n); + if(B != NULL){ + /* B = -1/3*a2*(a4-2/9*a2^2) + a6 */ + /** B <- 2/9*a2^2 = 2 * (a2^2/3) / 3 **/ + mod_from_rat2(B, tmp1, tmp3, n); + mpz_mul_si(B, B, 2); + mpz_sub(B, a4, B); + mpz_mul(B, B, tmp2); + mpz_sub(B, a6, B); + mpz_mod(B, B, n); + } +#if DEBUG_TORSION >= 2 + gmp_printf("N:=%Zd;\n", n); + gmp_printf("a2:=%Zd; a4:=%Zd; a6:=%Zd;\n", a2, a4, a6); + gmp_printf("A:=%Zd; B:=%Zd;\n", A, B); + if(X != NULL && x0 != NULL){ + gmp_printf("x:=%Zd;\n", x0); + gmp_printf("X:=%Zd;\n", X); + } + if(Y != NULL && y0 != NULL){ + gmp_printf("y:=%Zd;\n", Y); + printf("(y^2-x^3-a2*x^2-a4*x-a6) mod N;\n"); + printf("(y^2-X^3-A*X-B) mod N;\n"); + } +#endif + mpz_clear(tmp1); + mpz_clear(tmp2); + mpz_clear(tmp3); +} + +/* + The original Kubert curve E(b, c) is y^2+(1-c)*x*y-b*y = x^3-b*x^2 + The medium Weierstrass form is y^2=x^3+a2*x^2+a4*x+a6 with point (x0, y0); + we convert this to short Weierstrass form: + E: Y^2 = X^3 + A * X + B + and point P=(X, Y) on E. 
+*/ +void +kubert_to_weierstrass(mpz_t A, mpz_t B, mpz_t X, mpz_t Y, + mpz_t b, mpz_t c, mpz_t x0, mpz_t y0, mpz_t n) +{ + mpz_t a2, a4, a6; + + mpz_init(a2); + mpz_init(a4); + mpz_init(a6); + KW2W246(a2, a4, a6, b, c, n, 1); + /* second conversion */ + MediumWeierstrassToShortWeierstrass(A, B, X, Y, a2, a4, a6, x0, y0, n); +#if DEBUG_TORSION >= 2 + gmp_printf("a2:=%Zd; a4:=%Zd; a6:=%Zd; A:=%Zd; B:=%Zd;\n", a2, a4, a6,A,B); + gmp_printf("X:=%Zd; Y:=%Zd;\n", X, Y); +#endif + mpz_clear(a2); + mpz_clear(a4); + mpz_clear(a6); +} + +static int +forbidden(char *torsion, int u){ + if(strcmp(torsion, "Z10") == 0) + return u == 1 || u == 2; + else if(strcmp(torsion, "Z3xZ3") == 0) + return u == 2; + return 0; +} + +/* Kubert: put b = c. + SIDE EFFECT: tE[0..nE[ and tP[0..nE[ receive a curve of torsion Z5 + and a point on it using parameters [smin..smax[. + OUTPUT: ECM_NO_FACTOR_FOUND or ECM_FACTOR_FOUND_STEP1 if a factor is found. +*/ +int +build_curves_with_torsion_Z5(mpz_t f, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int smin, int smax, int nE) +{ + mpz_t A, B, X, Y; + int s, ret = ECM_NO_FACTOR_FOUND, nc = 0; + mpz_t x0, y0, c, tmp; + + mpz_init(A); + mpz_init(B); + mpz_init(X); + mpz_init(Y); + mpz_init(x0); + mpz_init(y0); + mpz_init(c); + mpz_init(tmp); + for(s = smin; s < smax; s++){ + mpz_set_si(x0, s); + /* c:=1/2*x0*(4*x0+1)/(3*x0+1); */ + /* y0 <- 2*(3*x0+1) */ + mpz_mul_si(y0, x0, 3); + mpz_add_si(y0, y0, 1); + /* tmp <- x0*(4*x0+1) */ + mpz_add(tmp, y0, x0); + mpz_mul(tmp, tmp, x0); + mpz_add(y0, y0, y0); + if(mod_from_rat2(c, tmp, y0, n->orig_modulus) == 0){ + printf("factor found during Z5_init\n"); + mpz_gcd(f, c, n->orig_modulus); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + /* y0:=x0*(x0+1)*(4*x0+1)/4/(3*x0+1) = (x0+1)*c/2 */ + mpz_add_si(y0, x0, 1); + mpz_mul(y0, y0, c); + mpz_mod(y0, y0, n->orig_modulus); + mod_div_2(y0, n->orig_modulus); +#if DEBUG_TORSION >= 2 + gmp_printf("x0:=%Zd;\nc:=%Zd;\ny0:=%Zd;\n", x0, c, y0); + 
printf("cr:=1/2*x0*(4*x0+1)/(3*x0+1);\n"); +#endif + /* P:=WE![x0, y0, 1]; */ + kubert_to_weierstrass(A, B, X, Y, c, c, x0, y0, n->orig_modulus); + if(check_weierstrass(A, B, X, Y, tmp, x0, n->orig_modulus) == 0){ + printf("#!# check_weierstrass false\n"); + ret = ECM_ERROR; + break; + } + ell_curve_init(tE[nc], ECM_EC_TYPE_WEIERSTRASS, ECM_LAW_AFFINE,n); + mpz_set(tE[nc]->a4, A); + mpz_set(tE[nc]->a6, B); + ell_point_init(tP[nc], tE[nc], n); + mpz_set(tP[nc]->x, X); + mpz_set(tP[nc]->y, Y); + nc++; + if(nc >= nE) + break; + } + mpz_clear(A); + mpz_clear(B); + mpz_clear(X); + mpz_clear(Y); + mpz_clear(x0); + mpz_clear(y0); + mpz_clear(c); + mpz_clear(tmp); + return ret; +} + +/* + E_aux: T^2 = S^3 + A * S + B + => quartic QC: Y^2 = X^4 - 6 * A2 * X^2 + 4 * A1 * X + A0, with + X = (T-A1/2)/(S-A2), Y = -X^2 + 2 * S + A2. + => quartic y^2 = f(x) = a4*x^4+...+a0, where + x = x0+y0/(X-cte), where cte = f'(x0)/4/y0 + y = Y/y0*(x-x0)^2 = Y*y0/(X-cte)^2 + INPUT: (s, t) is a point on E_aux; (x0, y0) a point on QC. + SIDE EFFECT: x, y contain a point on the elliptic curve. 
+ OUTPUT: 1 if no pb occurred, + 0 if a factor was found and put in f + */ +int +cubic_to_quartic(mpz_t f, mpz_t n, mpz_t x, mpz_t y, + mpz_t s, mpz_t t, mpz_t A2, mpz_t A1div2, + mpz_t x0, mpz_t y0, mpz_t cte) +{ + mpz_t X, Y; + int ret = 1; + + mpz_init(X); + mpz_init(Y); + /* X <- (t-A1/2)/(s-A2) */ + mpz_sub(x, t, A1div2); + mpz_sub(y, s, A2); + if(mod_from_rat2(X, x, y, n) == 0){ + mpz_set(f, X); + ret = 0; + } + else{ + /* Y <- -X^2 + 2 * s + A2 */ + mpz_mul(Y, X, X); + mpz_sub(Y, A2, Y); + mpz_add(Y, Y, s); + mpz_add(Y, Y, s); + mpz_mod(Y, Y, n); + /* x <- x0+y0/(X-cte) */ + mpz_sub(X, X, cte); + mpz_mod(X, X, n); + if(mpz_invert(f, X, n) == 0){ + mpz_gcd(f, X, n); + ret = 0; + } + else{ + /* x <- y0/(X-cte) */ + mpz_mul(x, f, y0); + mpz_mod(x, x, n); + /* y <- x/(X-cte) = y0/(X-cte)^2 */ + mpz_mul(y, x, f); + mpz_mod(y, y, n); + mpz_mul(y, y, Y); + mpz_mod(y, y, n); + mpz_add(x, x, x0); + mpz_mod(x, x, n); + } + } + mpz_clear(X); + mpz_clear(Y); + return ret; +} + +int +build_curves_with_torsion_aux(ell_curve_t Eaux, ell_point_t Paux, + mpz_t A2, mpz_t A1div2, mpz_t x0, mpz_t y0, + mpz_t cte, + char *sa4, char *sa6, char *sPx, char *sPy, + char *sA2, char *sA1div2, char *sx0, char *sy0, + char *scte, mpmod_t n, mpres_t tmp) +{ + mpz_t f; + + mpz_init(f); + mod_from_rat_str(f, sa4, n->orig_modulus); + mpres_set_z(tmp, f, n); + ell_curve_init_set(Eaux, ECM_EC_TYPE_WEIERSTRASS, ECM_LAW_AFFINE, tmp, n); + mod_from_rat_str(f, sa6, n->orig_modulus); + mpres_set_z(Eaux->a6, f, n); + ell_point_init(Paux, Eaux, n); + mod_from_rat_str(f, sPx, n->orig_modulus); + mpres_set_z(Paux->x, f, n); + mod_from_rat_str(f, sPy, n->orig_modulus); + mpres_set_z(Paux->y, f, n); +#if DEBUG_TORSION >= 2 + printf("Paux:="); + pt_print(Eaux, Paux, n); + printf(";\n"); +#endif + mod_from_rat_str(A2, sA2, n->orig_modulus); + mod_from_rat_str(A1div2, sA1div2, n->orig_modulus); + mod_from_rat_str(x0, sx0, n->orig_modulus); + mod_from_rat_str(y0, sy0, n->orig_modulus); + 
mod_from_rat_str(cte, scte, n->orig_modulus); + mpz_clear(f); + return 1; +} + +/* + SIDE EFFECT: tE[0..nE[ and tP[0..nE[ receive a curve of torsion Z7 + and a point on it using parameters [umin..umax[. + OUTPUT: ECM_NO_FACTOR_FOUND or ECM_FACTOR_FOUND_STEP1 if a factor is found. + tE[i], tP[i] are built in raw modular form, not Montgomery form. + REM: we assume gcd(n, 6). +*/ +int +build_curves_with_torsion_Z7(mpz_t fac, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int umin, int umax, int nE) +{ + int u, ret = ECM_NO_FACTOR_FOUND, nc = 0; + mpz_t A2, A1div2, x0, y0, cte, d, c, b, kx0, ky0, A, B, X, Y; + mpres_t tmp; + ell_curve_t E; + ell_point_t P, Q; + + mpz_init(A2); + mpz_init(A1div2); + mpz_init(cte); + mpz_init(x0); + mpz_init(y0); + mpz_init(A); + mpz_init(B); + mpz_init(X); + mpz_init(Y); + /* Eaux = "1295/48", "-1079/864" */ + /* Paux = "2185/12", "-2458" */ + /* Y^2 = X^4-1/2*X^2-8*X-1727/16 */ + mpres_init(tmp, n); + build_curves_with_torsion_aux(E, P, A2, A1div2, x0, y0, cte, + "1295/48", "-1079/864", + "2185/12", "-2458", + "1/12", "-1", + "-1", "8", "-7/2", + n, tmp); + mpz_init(d); + mpz_init(c); + mpz_init(b); + mpz_init(kx0); + mpz_init(ky0); + ell_point_init(Q, E, n); + mpz_set_si(d, umin-1); + if(ell_point_mul(fac, Q, d, P, E, n) == 0){ + printf("found factor during init of Q in Z7\n"); + ret = ECM_FACTOR_FOUND_STEP1; + } + for(u = umin; (ret != ECM_FACTOR_FOUND_STEP1) && u < umax; u++){ + /* update Q */ + if(ell_point_add(fac, Q, P, Q, E, n) == 0){ + printf("found factor during update of Q in Z7\n"); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + if(ell_point_is_on_curve(Q, E, n) == 0){ + printf("#!# Q=[%d]P is not on E\n", u); + // ell_point_print(Q, E, n); printf("\n"); + ret = ECM_ERROR; + break; + } + /* come back to plain (not Montgomery) residues */ + mpres_get_z(b, Q->x, n); + mpres_get_z(c, Q->y, n); +#if DEBUG_TORSION >= 2 + gmp_printf("b:=%Zd; c:=%Zd;\n", b, c); + printf("(s, t)[%d]:=", u); + pt_print(E, Q, n); + printf(";\n"); 
+#endif + if(cubic_to_quartic(fac, n->orig_modulus, d, ky0, b, c, + A2, A1div2, x0, y0, cte) == 0){ + printf("found factor in Z7 (cubic_to_quartic)\n"); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + /* (d, ky0) is a point on y^2 = x^4-18*x^3+13*x^2-28*x+4 */ + /* d:=x; */ + /* x0:=-2*d; */ + mpz_mul_si(kx0, d, -2); + mpz_mod(kx0, kx0, n->orig_modulus); + /* y0:=d*y/2; */ + mpz_mul(ky0, ky0, d); + mpz_mod(ky0, ky0, n->orig_modulus); + mod_div_2(ky0, n->orig_modulus); + /* c:=d^2-d; */ + mpz_mul(c, d, d); + mpz_sub(c, c, d); + mpz_mod(c, c, n->orig_modulus); + /* b:=c*d; */ + mpz_mul(b, c, d); + mpz_mod(b, b, n->orig_modulus); + /* to short Weierstrass form */ + kubert_to_weierstrass(A, B, X, Y, b, c, kx0, ky0, n->orig_modulus); + if(check_weierstrass(A, B, X, Y, tmp, x0, n->orig_modulus) == 0){ + ret = ECM_ERROR; + break; + } + ell_curve_init(tE[nc], ECM_EC_TYPE_WEIERSTRASS, ECM_LAW_AFFINE,n); + mpz_set(tE[nc]->a4, A); + mpz_set(tE[nc]->a6, B); + ell_point_init(tP[nc], tE[nc], n); + mpz_set(tP[nc]->x, X); + mpz_set(tP[nc]->y, Y); +#if DEBUG_TORSION >= 2 + gmp_printf("E[%d]:=[%Zd];\n", nc, tE[nc]->a4); + gmp_printf("P[%d]:=[%Zd, %Zd, %Zd];\n", + nc, tP[nc]->x, tP[nc]->y, tP[nc]->z); +#endif + nc++; + if(nc >= nE) + break; + } + mpz_clear(A2); + mpz_clear(A1div2); + mpz_clear(x0); + mpz_clear(y0); + mpz_clear(cte); + mpz_clear(A); + mpz_clear(B); + mpz_clear(X); + mpz_clear(Y); + ell_point_clear(P, E, n); + ell_point_clear(Q, E, n); + ell_curve_clear(E, n); + mpz_clear(d); + mpz_clear(c); + mpz_clear(b); + mpz_clear(kx0); + mpz_clear(ky0); + mpres_clear(tmp, n); + return ret; +} + +/* + SIDE EFFECT: tE[0..nE[ and tP[0..nE[ receive a curve of torsion Z9 + and a point on it using parameters [umin..umax[. + OUTPUT: ECM_NO_FACTOR_FOUND or ECM_FACTOR_FOUND_STEP1 if a factor is found. + tE[i], tP[i] are built in raw modular form, not Montgomery form. + REM: we assume gcd(n, 6). 
+*/ +int +build_curves_with_torsion_Z9(mpz_t fac, mpmod_t n, ell_curve_t *tE, + ell_point_t *tP, int umin, int umax, int nE) +{ + int u, ret = ECM_NO_FACTOR_FOUND, nc = 0; + mpz_t A2, A1div2, x0, y0, cte, d, c, b, kx0, ky0, A, B, X, Y, f; + mpres_t tmp; + ell_curve_t E; + ell_point_t P, Q; + + mpz_init(A2); + mpz_init(A1div2); + mpz_init(cte); + mpz_init(x0); + mpz_init(y0); + mpz_init(A); + mpz_init(B); + mpz_init(X); + mpz_init(Y); + /* Eaux = [-9, 9] */ + /* Paux = [1, 1, 1] */ + /* Y^2 = X^4-24*X-36 */ + mpres_init(tmp, n); + build_curves_with_torsion_aux(E, P, A2, A1div2, x0, y0, cte, + "-9", "9", "1", "1", + "0", "3", + "2", "3", "0", + n, tmp); + mpz_init(f); + mpz_init(d); + mpz_init(c); + mpz_init(b); + mpz_init(kx0); + mpz_init(ky0); + ell_point_init(Q, E, n); + mpz_set_si(d, umin-1); + if(ell_point_mul(fac, Q, d, P, E, n) == 0){ + printf("found factor during init of Q in Z9\n"); + ret = ECM_FACTOR_FOUND_STEP1; + } + for(u = umin; (ret != ECM_FACTOR_FOUND_STEP1) && u < umax; u++){ + /* update Q */ + if(ell_point_add(fac, Q, P, Q, E, n) == 0){ + printf("found factor during update of Q in Z9\n"); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } +#if DEBUG_TORSION >= 2 + printf("(s, t)[%d]:=", u); + pt_print(E, Q, n); + printf(";\n"); +#endif + if(ell_point_is_on_curve(Q, E, n) == 0){ + printf("#!# Q=[%d]P is not on E\n", u); + ret = ECM_ERROR; + break; + } + mpres_get_z(b, Q->x, n); + mpres_get_z(c, Q->y, n); + if(cubic_to_quartic(fac, n->orig_modulus, f, ky0, b, c, + A2, A1div2, x0, y0, cte) == 0){ + printf("found factor in Z9 (cubic_2_quartic)\n"); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + /* f:=x; */ + /* d:=f*(f-1)+1; */ + mpz_sub_si(d, f, 1); + mpz_mul(d, d, f); + mpz_add_si(d, d, 1); + mpz_mod(d, d, n->orig_modulus); + /* c:=f*(d-1); */ + mpz_sub_si(c, d, 1); + mpz_mul(c, c, f); + mpz_mod(c, c, n->orig_modulus); + /* kx0:=(2*f-1)*f^2; */ + /** b <- f^2 **/ + mpz_mul(b, f, f); + mpz_mod(b, b, n->orig_modulus); + mpz_mul_si(kx0, f, 2); + 
mpz_sub_si(kx0, kx0, 1); + mpz_mul(kx0, kx0, b); + mpz_mod(kx0, kx0, n->orig_modulus); + /* ky0:=y*f^4/2; */ + /** b <- b^2 = f^4 **/ + mpz_mul(b, b, b); + mpz_mul(ky0, ky0, b); + mpz_mod(ky0, ky0, n->orig_modulus); + mod_div_2(ky0, n->orig_modulus); + /* b:=c*d; */ + mpz_mul(b, c, d); + mpz_mod(b, b, n->orig_modulus); +#if DEBUG_TORSION >= 2 + gmp_printf("f=%Zd d=%Zd c=%Zd b=%Zd\n", f, d, c, b); + gmp_printf("kx0=%Zd ky0=%Zd\n", kx0, ky0); +#endif + /* to short Weierstrass form */ + kubert_to_weierstrass(A, B, X, Y, b, c, kx0, ky0, n->orig_modulus); + if(check_weierstrass(A, B, X, Y, tmp, x0, n->orig_modulus) == 0){ + ret = ECM_ERROR; + break; + } + ell_curve_init(tE[nc], ECM_EC_TYPE_WEIERSTRASS, ECM_LAW_AFFINE, n); + mpz_set(tE[nc]->a4, A); + mpz_set(tE[nc]->a6, B); + ell_point_init(tP[nc], tE[nc], n); + mpz_set(tP[nc]->x, X); + mpz_set(tP[nc]->y, Y); + nc++; + if(nc >= nE) + break; + } + mpz_clear(A); + mpz_clear(B); + mpz_clear(X); + mpz_clear(Y); + mpz_clear(A2); + mpz_clear(A1div2); + mpz_clear(x0); + mpz_clear(y0); + mpz_clear(cte); + ell_point_clear(P, E, n); + ell_point_clear(Q, E, n); + mpz_clear(f); + mpz_clear(d); + mpz_clear(c); + mpz_clear(b); + mpz_clear(kx0); + mpz_clear(ky0); + mpres_clear(tmp, n); + return ret; +} + +int +build_curves_with_torsion_Z10(mpz_t fac, mpmod_t n, ell_curve_t *tE, + ell_point_t *tP, int umin, int umax, int nE) +{ + int u, ret = ECM_NO_FACTOR_FOUND, nc = 0; + mpz_t A2, A1div2, x0, y0, cte, d, c, b, kx0, ky0, A, B, X, Y; + mpz_t f; + mpres_t tmp; + ell_curve_t E; + ell_point_t P, Q; + + mpz_init(A2); + mpz_init(A1div2); + mpz_init(cte); + mpz_init(x0); + mpz_init(y0); + mpz_init(A); + mpz_init(B); + mpz_init(X); + mpz_init(Y); + /* Eaux = [2/3, -53/108] */ + /* Paux = [2/3, 1/2, 1] */ + /* Y^2 = X^4-4*X^2-4*X-4 */ + mpres_init(tmp, n); + build_curves_with_torsion_aux(E, P, A2, A1div2, x0, y0, cte, + "2/3", "-53/108", "2/3", "1/2", + "2/3", "-1/2", + "0", "1", "-2", + n, tmp); + mpz_init(f); + mpz_init(d); + mpz_init(c); + 
mpz_init(b); + mpz_init(kx0); + mpz_init(ky0); + ell_point_init(Q, E, n); + for(u = umin; u < umax; u++){ + if(forbidden("Z10", u)) + continue; + /* update Qaux */ + mpz_set_si(d, u); + if(ell_point_mul(fac, Q, d, P, E, n) == 0){ + printf("found factor during update of Q in Z10\n"); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } +#if DEBUG_TORSION >= 2 + printf("(s, t)[%d]:=", u); + pt_print(E, Q, n); + printf(";\n"); +#endif + if(ell_point_is_on_curve(Q, E, n) == 0){ + printf("#!# Q=[%d]P is not on E\n", u); + ret = ECM_ERROR; + break; + } + mpres_get_z(b, Q->x, n); + mpres_get_z(c, Q->y, n); + if(cubic_to_quartic(fac, n->orig_modulus, f, ky0, b, c, + A2, A1div2, x0, y0, cte) == 0){ + printf("found factor in Z10 (cubic_2_quartic)\n"); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + /* f:=x; */ + /* d:=f^2/(f-(f-1)^2) = -f^2/(f^2-3*f+1) */ + /** b <- -f^2 **/ + mpz_mul(b, f, f); + mpz_neg(b, b); + mpz_mod(b, b, n->orig_modulus); + /* c = f^2-3*f+1 = f*(f-3)+1 */ + mpz_sub_si(c, f, 3); + mpz_mul(c, c, f); + mpz_add_si(c, c, 1); + mpz_mod(c, c, n->orig_modulus); + if(mod_from_rat2(d, b, c, n->orig_modulus) == 0){ + printf("inverse found in Z10 (d)\n"); + mpz_set(fac, d); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + /* c:=f*(d-1); */ + mpz_sub_si(c, d, 1); + mpz_mul(c, c, f); + mpz_mod(c, c, n->orig_modulus); + /* ky0:=y*f^4/(f^2-3*f+1)^2/2; = num/den */ + /* it seems that ky0 = y*d^2/2 */ + mpz_mul(b, ky0, d); + mpz_mul(b, b, d); + mpz_mod(b, b, n->orig_modulus); + mpz_set_si(fac, 2); + mod_from_rat2(ky0, b, fac, n->orig_modulus); + /* kx0:=-f*d; */ + mpz_mul(kx0, f, d); + mpz_neg(kx0, kx0); + mpz_mod(kx0, kx0, n->orig_modulus); + /* b:=c*d; */ + mpz_mul(b, c, d); + mpz_mod(b, b, n->orig_modulus); +#if DEBUG_TORSION >= 2 + gmp_printf("f:=%Zd; d:=%Zd; c:=%Zd; b:=%Zd;\n", f, d, c, b); + gmp_printf("kx0:=%Zd; ky0:=%Zd;\n", kx0, ky0); +#endif + /* to short Weierstrass form */ + kubert_to_weierstrass(A, B, X, Y, b, c, kx0, ky0, n->orig_modulus); + if(check_weierstrass(A, 
B, X, Y, tmp, x0, n->orig_modulus) == 0){ + ret = ECM_ERROR; + break; + } + ell_curve_init(tE[nc], ECM_EC_TYPE_WEIERSTRASS, ECM_LAW_AFFINE, n); + mpz_set(tE[nc]->a4, A); + mpz_set(tE[nc]->a6, B); + ell_point_init(tP[nc], tE[nc], n); + mpz_set(tP[nc]->x, X); + mpz_set(tP[nc]->y, Y); + nc++; + if(nc >= nE) + break; + } +#if DEBUG_TORSION >= 2 + if(ret != ECM_ERROR && nc > 0){ + printf("Curves built\n"); + pt_many_print(tE, tP, nE, n); + } +#endif + mpz_clear(A); + mpz_clear(B); + mpz_clear(X); + mpz_clear(Y); + mpz_clear(A2); + mpz_clear(A1div2); + mpz_clear(x0); + mpz_clear(y0); + mpz_clear(cte); + ell_point_clear(P, E, n); + ell_point_clear(Q, E, n); + ell_curve_clear(E, n); + mpres_clear(tmp, n); + mpz_clear(d); + mpz_clear(c); + mpz_clear(b); + mpz_clear(kx0); + mpz_clear(ky0); + mpz_clear(f); + return ret; +} + +/* Warning: b and a have the Montgomery meaning in this function. + All tE[i] will be in Montgomery form: B*Y^2 = X^3 + A * X^2 + X. +*/ +int +build_curves_with_torsion_Z2xZ8(mpz_t fac, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int umin, int umax, int nE) +{ + int u, nc = 0, ret = ECM_NO_FACTOR_FOUND; + mpz_t tmp, a, b, alpha, beta, c, d, kx0, ky0, wx0, mb; + mpres_t tmp2; + ell_curve_t E; + ell_point_t P, Q; + + mpz_init(alpha); + mpz_init(beta); + mpz_init(tmp); + mpz_init(a); + mpz_init(b); + mpz_init(c); + mpz_init(d); + mpz_init(kx0); + mpz_init(ky0); + mpz_init(wx0); + mpz_init(mb); + + /* Eaux = [-8, -32] */ + /* Paux = [12, 40, 1] */ + mpres_init(tmp2, n); + mpz_set_str(fac, "-8", 10); + mpres_set_z(tmp2, fac, n); + ell_curve_init_set(E, ECM_EC_TYPE_WEIERSTRASS, ECM_LAW_AFFINE, tmp2, n); + ell_point_init(P, E, n); + mpz_set_str(fac, "12", 10); + mpres_set_z(P->x, fac, n); + mpz_set_str(fac, "40", 10); + mpres_set_z(P->y, fac, n); + mpz_set_ui(P->z, 1); + + ell_point_init(Q, E, n); + mpz_set_si(d, umin-1); + if(ell_point_mul(fac, Q, d, P, E, n) == 0){ + printf("found factor during init of Q in Z2xZ8\n"); + ret = ECM_FACTOR_FOUND_STEP1; + } 
+ for(u = umin; (ret != ECM_FACTOR_FOUND_STEP1) && u < umax; u++){ + /* update Q */ + if(ell_point_add(fac, Q, P, Q, E, n) == 0){ + printf("found factor during update of Q in Z2xZ8\n"); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } +#if DEBUG_TORSION >= 2 + printf("(s, t)[%d]:=", u); + pt_print(E, Q, n); + printf(";\n"); +#endif + /* beta <- (y+25)/(x-9) */ + mpres_get_z(a, Q->x, n); + mpres_get_z(b, Q->y, n); + mpz_mod(wx0, a, n->orig_modulus); + mpz_sub_si(a, a, 9); + mpz_mod(a, a, n->orig_modulus); + mpz_add_si(b, b, 25); + mpz_mod(b, b, n->orig_modulus); + if(mod_from_rat2(beta, b, a, n->orig_modulus) == 0){ + printf("found factor in Z2xZ8 (beta)\n"); + mpz_set(fac, beta); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + mpz_add_si(tmp, beta, 1); + mpz_mod(tmp, tmp, n->orig_modulus); + /* alpha <- 1/(beta+1) */ + if(mpz_invert(alpha, tmp, n->orig_modulus) == 0){ + printf("found factor in Z2xZ8 (alpha)\n"); + mpz_gcd(fac, tmp, n->orig_modulus); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + /** d <- 8*alpha^2-1; + d = -(beta^2+2*beta-7)/(beta+1)^2 + **/ + mpz_mul(d, alpha, alpha); + mpz_mul_si(d, d, 8); + mpz_sub_si(d, d, 1); + mpz_mod(d, d, n->orig_modulus); + /* d:=2*alpha*(4*alpha+1)/d; */ + mpz_mul_si(c, alpha, 4); + mpz_add_si(c, c, 1); + mpz_mul(c, c, alpha); + mpz_mul_si(c, c, 2); + mpz_mod(c, c, n->orig_modulus); + if(mod_from_rat2(fac, c, d, n->orig_modulus) == 0){ + // the only possibility is d = 0 mod p or 8*alpha^2-1 = 0 mod p + printf("found factor in Z2xZ8 (d)\n"); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + mpz_set(d, fac); + /* c:=(2*d-1)*(d-1)/d;*/ + mpz_sub_si(fac, d, 1); + /** kx0 <- 2*d-1 **/ + mpz_mul_si(kx0, d, 2); + mpz_sub_si(kx0, kx0, 1); + mpz_mul(fac, fac, kx0); + mpz_mod(fac, fac, n->orig_modulus); + if(mod_from_rat2(c, fac, d, n->orig_modulus) == 0){ + // this is possible only if d = 0 mod p or + // 2*alpha*(4*alpha+1) = 0 mod p + printf("found factor in Z2xZ8 (d2)\n"); + mpz_set(fac, c); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } 
+ /* b = c*d */ + mpz_mul(b, c, d); + mpz_mod(b, b, n->orig_modulus); + /* kx0:=-(2*d-1)/4;*/ + mod_div_2(kx0, n->orig_modulus); + mod_div_2(kx0, n->orig_modulus); + mpz_mul_si(kx0, kx0, -1); + mpz_mod(kx0, kx0, n->orig_modulus); + /* ky0:=(c/8)*(-beta^2+2*uP[1]+9); */ + mpz_mul(fac, beta, beta); + mpz_set(a, wx0); + mpz_sub(fac, a, fac); + mpz_add(fac, fac, a); + mpz_add_si(fac, fac, 9); + mpz_mul(fac, fac, c); + mpz_mod(fac, fac, n->orig_modulus); + mod_div_2(fac, n->orig_modulus); + mod_div_2(fac, n->orig_modulus); + mod_div_2(fac, n->orig_modulus); + /* ky0:=ky0/(beta^2+2*beta-7); */ + mpz_add_si(tmp, beta, 2); + mpz_mul(tmp, tmp, beta); + mpz_sub_si(tmp, tmp, 7); + mpz_mod(tmp, tmp, n->orig_modulus); + /* as proven above, we cannot have tmp non invertible at that point */ + mod_from_rat2(ky0, fac, tmp, n->orig_modulus); + KW2W246(fac, a, NULL, b, c, n->orig_modulus, 0); +#if DEBUG_TORSION >= 2 + gmp_printf("kwx0:=%Zd;\n", kx0); + gmp_printf("kwy0:=%Zd;\n", ky0); + printf("(kwy0^2-(kwx0^3+a2*kwx0^2+a4*kwx0+a6)) mod N;\n"); +#endif + /* wx0:=kx0+a2/3; */ + mpz_set_si(tmp, 3); + mod_from_rat2(wx0, fac, tmp, n->orig_modulus); + mpz_add(wx0, wx0, kx0); + mpz_mod(wx0, wx0, n->orig_modulus); + /* mb:=-1/(d-1)^2; */ + mpz_sub_si(tmp, d, 1); + mpz_mul(tmp, tmp, tmp); + mpz_mod(tmp, tmp, n->orig_modulus); + mpz_neg(tmp, tmp); + if(mpz_invert(mb, tmp, n->orig_modulus) == 0){ + printf("found factor in Z2xZ8 (mb)\n"); + mpz_gcd(fac, tmp, n->orig_modulus); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + /* ma:=-1/4*(8*d^4-16*d^3+16*d^2-8*d+1)/(d-1)^2/d^2; + :=mb*(8*d^4-16*d^3+16*d^2-8*d+1)/(4*d^2) + */ + mpz_set_si(fac, 8); /* num */ + mpz_mul(fac, fac, d); mpz_add_si(fac, fac, -16); + mpz_mul(fac, fac, d); mpz_add_si(fac, fac, 16); + mpz_mul(fac, fac, d); mpz_add_si(fac, fac, -8); + mpz_mul(fac, fac, d); mpz_add_si(fac, fac, 1); +#if 0 + mpz_sub_si(tmp, d, 1); /* den */ + mpz_mul(tmp, tmp, d); + mpz_mul(tmp, tmp, tmp); + mpz_mul_si(tmp, tmp, -4); + mpz_mod(tmp, tmp, 
n->orig_modulus); +#else + mpz_mul(fac, fac, mb); + /* one day, we could save 1/d computation again */ + mpz_mul(tmp, d, d); + mpz_mul_si(tmp, tmp, 4); +#endif + /* to Montgomery form */ + ell_curve_init(tE[nc], ECM_EC_TYPE_MONTGOMERY, ECM_LAW_HOMOGENEOUS,n); + ell_point_init(tP[nc], tE[nc], n); + /* this cannot yield a factor, since d is invertible at that point */ + mod_from_rat2(tE[nc]->a2, fac, tmp, n->orig_modulus); + /* not really needed, but useful for debug */ + mpz_set_ui(tE[nc]->a4, 1); + mpz_set_ui(tE[nc]->a6, 0); + /* mx:=mb*wx0-ma/3; */ + mpz_mul(fac, mb, wx0); + mpz_set_si(tmp, 3); + mod_from_rat2(tP[nc]->x, tE[nc]->a2, tmp, n->orig_modulus); + mpz_sub(tP[nc]->x, fac, tP[nc]->x); + mpz_mod(tP[nc]->x, tP[nc]->x, n->orig_modulus); + /* my:=mb*ky0; */ +#if DEBUG_TORSION >= 2 + gmp_printf("N:=%Zd;\n", n->orig_modulus); + gmp_printf("ma:=%Zd;\n", tE[nc]->a2); + gmp_printf("mb:=%Zd;\n", mb); + gmp_printf("kx0:=%Zd;\n", kx0); + gmp_printf("ky0:=%Zd;\n", ky0); + gmp_printf("mx0:=%Zd;\n", tP[nc]->x); + mpz_mul(tmp, mb, ky0); + mpz_mod(tmp, tmp, n->orig_modulus); + gmp_printf("my0:=%Zd;\n", tmp); + printf("chk:=(mb*my0^2-mx0^3-ma*mx0^2-mx0) mod N;\n"); +#endif + nc++; + if(nc >= nE) + break; + } +#if DEBUG_TORSION >= 2 + printf("Curves built\n"); + pt_many_print(tE, tP, nE, n); +#endif + ell_point_clear(P, E, n); + ell_point_clear(Q, E, n); + ell_curve_clear(E, n); + mpz_clear(mb); + mpz_clear(tmp); + mpz_clear(a); + mpz_clear(b); + mpz_clear(c); + mpz_clear(d); + mpz_clear(alpha); + mpz_clear(beta); + mpz_init(kx0); + mpz_init(ky0); + mpz_init(wx0); + mpres_clear(tmp2, n); + return ret; +} + +/* Z3xZ3 over Q(sqrt(-3)). Interesting if we know that p | N is s.t. + p = 1 mod 3. + Source: Dujella and Najman, arxiv:1201.0266v1 + A more simpler and more efficient stuff, using Hessian form. 
*/ +int +build_curves_with_torsion_Z3xZ3(mpz_t f, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int umin, int umax, int nE) +{ + int u, nc = 0, ret = ECM_NO_FACTOR_FOUND; + mpz_t u0, v0, D, num, den; + + mpz_init(u0); + mpz_init(num); + mpz_init(den); + mpz_init(D); + mpz_init_set_si(v0, umin-1); /* to prevent u0 = v0 */ + for(u = umin; u < umax; u++){ + if(forbidden("Z3xZ3", u)) + continue; + mpz_set_si(u0, u); + /* D:=RatMod((u0^3+v0^3+1)/(3*u0*v0), N); */ + mpz_mul(num, u0, u0); + mpz_mul(num, num, u0); + mpz_mul(den, v0, v0); + mpz_mul(den, den, v0); + mpz_add(num, num, den); + mpz_add_si(num, num, 1); + mpz_mod(num, num, n->orig_modulus); + if(mpz_sgn(num) == 0) + continue; + + mpz_mul(den, u0, v0); + mpz_mul_si(den, den, 3); + mpz_mod(den, den, n->orig_modulus); + + if(mod_from_rat2(D, num, den, n->orig_modulus) == 0){ + printf("found factor in Z3xZ3 (D)\n"); + mpz_set(f, D); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + mpz_mul(num, D, D); + mpz_mul(num, num, D); + mpz_mod(num, num, n->orig_modulus); + if(mpz_cmp_ui(num, 1) == 0){ + printf("D^3=1 => singluar curve\n"); + ret = ECM_ERROR; + break; + } + ell_curve_init_set(tE[nc],ECM_EC_TYPE_HESSIAN,ECM_LAW_HOMOGENEOUS,D,n); + ell_point_init(tP[nc], tE[nc], n); + mpz_set(tP[nc]->x, u0); + mpz_set(tP[nc]->y, v0); + mpz_set_ui(tP[nc]->z, 1); + nc++; + if(nc >= nE) + break; + } + mpz_clear(u0); + mpz_clear(v0); + mpz_clear(D); + mpz_clear(num); + mpz_clear(den); + return ret; +} + +/* For a small price, add a 2-torsion point, also over Q(sqrt(-3)). 
*/ +int +build_curves_with_torsion_Z3xZ6(mpz_t f, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int umin, int umax, int nE) +{ + int u, nc = 0, ret = ECM_NO_FACTOR_FOUND; + ell_curve_t E; + ell_point_t P, Q; + mpres_t tmp, num, den, tk, sk; + mpz_t t; + + mpz_init(t); + mpz_init(num); + mpz_init(den); + mpz_init(tk); + mpz_init(sk); + /* Eaux:=EllipticCurve([0, -4]); */ + /* Paux:=Eaux![2, 2, 1]; */ + mpres_init(tmp, n); + mpres_set_ui(tmp, 0, n); + ell_curve_init_set(E, ECM_EC_TYPE_WEIERSTRASS, ECM_LAW_AFFINE, tmp, n); + ell_point_init(P, E, n); + mpz_set_str(f, "2", 10); + mpres_set_z(P->x, f, n); + mpz_set_str(f, "2", 10); + mpres_set_z(P->y, f, n); + mpz_set_ui(P->z, 1); + + ell_point_init(Q, E, n); + for(u = umin; u < umax; u++){ + /* update Qaux */ + mpz_set_si(f, u); + if(ell_point_mul(f, Q, f, P, E, n) == 0){ + printf("found factor in Z3xZ6 (update of Q)\n"); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } +#if DEBUG_TORSION >= 2 + printf("(s, t)[%d]:=", u); + pt_print(E, Q, n); + printf(";\n"); +#endif + mpres_get_z(tk, Q->x, n); + mpres_get_z(sk, Q->y, n); +#if 0 /* useless in affine form? 
*/ + mpres_get_z(t, Q->z, n); + if(mpz_invert(f, t, n->orig_modulus) == 0){ + printf("found factor in Z3xZ6 (normalization)\n"); + mpz_gcd(f, t, n->orig_modulus); + break; + } + mpz_mul(tk, tk, f); + mpz_mod(tk, tk, n->orig_modulus); + mpz_mul(sk, sk, f); + mpz_mod(sk, sk, n->orig_modulus); +#endif + /* t:=RatMod(-tk/2, N); */ + mpz_mul_si(t, tk, -1); + mod_div_2(t, n->orig_modulus); + /* D:=RatMod((2*t^3+1)/3/t^2, N); */ + mpz_mul(den, t, t); + mpz_mod(den, den, n->orig_modulus); + mpz_mul(num, den, t); + mpz_mul_si(num, num, 2); + mpz_add_si(num, num, 1); + mpz_mod(num, num, n->orig_modulus); + mpz_mul_si(den, den, 3); + mpz_mod(den, den, n->orig_modulus); + ell_curve_init(tE[nc], ECM_EC_TYPE_HESSIAN, ECM_LAW_HOMOGENEOUS, n); + ell_point_init(tP[nc], tE[nc], n); + if(mod_from_rat2(tE[nc]->a4, num, den, n->orig_modulus) == 0){ + /* only if t = 0, which seems hard */ + printf("found factor in Z3xZ6 (D)\n"); + mpz_set(f, tE[nc]->a4); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } +#if DEBUG_TORSION >= 1 + gmp_printf("D%d:=%Zd;\n", nc, tE[nc]->a4); +#endif + /* u0:=RatMod(sk/tk, N); + if tk was not invertible, it would have been caught before + */ + mod_from_rat2(tP[nc]->x, sk, tk, n->orig_modulus); + /* v0:=-1; */ + mpz_sub_si(tP[nc]->y, n->orig_modulus, 1); + mpz_set_ui(tP[nc]->z, 1); + nc++; + if(nc >= nE) + break; + } + mpz_clear(t); + mpz_clear(num); + mpz_clear(den); + mpz_clear(sk); + mpz_clear(tk); + mpres_clear(tmp, n); + return ret; +} + +/* JKL: K = Q(sqrt(-3), sqrt(8*t^3+1), t in Q, t != 0, 1, -1/2; + mu = (2*t^3+1)/(3*t^2) => parameter for Hessian form. + Tors(E) = Z/6xZ/6. + A "specified" point is (0:-1:1), but does it have infinite order? + Also: twisted Hessian is a*X^3+Y^3+Z^3=d*X*Y*Z, d/a=3*mu. + See JKL-ECM in ANTS-XII. + */ + +/* Original source is Brier + Clavier. + We can build curves in Montgomery form directly... + Useful if one knows that all p | n are 1 mod 4 (Cunningham, etc.). 
+*/ +int +build_curves_with_torsion_Z4xZ4(mpz_t f, mpmod_t n, ell_curve_t *tE, + ell_point_t *tP, + int smin, int smax, int nE) +{ + mpz_t tau, lambda, nu2, tmp, b, x0; + int nu, nc = 0, ret = ECM_NO_FACTOR_FOUND; + + mpz_init(tau); + mpz_init(lambda); + mpz_init(nu2); + mpz_init(tmp); + mpz_init(b); + mpz_init(x0); + for(nu = smin; nu < smax; nu++){ + mpz_set_si(nu2, nu*nu); + /* tau:=(nu^2+3)/2/nu; */ + mpz_add_si(lambda, nu2, 3); + mpz_set_si(tmp, 2*nu); + if(mod_from_rat2(tau, lambda, tmp, n->orig_modulus) == 0){ + printf("Factor found during init of Z4xZ4 (tau)\n"); + mpz_set(f, tau); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + /* lambda:=8*nu^3; */ + mpz_mul_si(lambda, nu2, 8*nu); + mpz_mod(lambda, lambda, n->orig_modulus); + /* A:=-27*lambda^4*(tau^8+14*tau^4+1); */ + /* B:=54*lambda^6*(tau^12-33*tau^8-33*tau^4+1); */ + /* x0:=3*(3*nu^12+34*nu^10+117*nu^8+316*nu^6+1053*nu^4+2754*nu^2+2187); */ + /* y0:=27*(nu^2-3)*(nu^2+1)*(nu^2+9)*(nu^6+5*nu^4+15*nu^2+27)^2; */ + /* P = (x0, y0) is a point on Y^2 = X^3+A*X+B */ + + /* Montgomery form: there are several possible mb */ + /* mb:=1/(9*lambda^2*(tau^4-1)); + lambda is invertible iff nu is; + tau^4-1 = (tau-1)(tau+1)(tau^2+1) + */ + mpz_powm_ui(x0, tau, 4, n->orig_modulus); + mpz_sub_si(x0, x0, 1); + mpz_mod(x0, x0, n->orig_modulus); + mpz_mul(tmp, x0, lambda); + mpz_mul(tmp, tmp, lambda); + mpz_mul_si(tmp, tmp, 9); + if(mpz_invert(b, tmp, n->orig_modulus) == 0){ + printf("Factor found during init of Z4xZ4 (mb)\n"); + mpz_gcd(f, tmp, n->orig_modulus); + ret = ECM_FACTOR_FOUND_STEP1; + break; + } + /* ma:=-2*(tau^4+1)/(tau^4-1); at this point: invertible! 
*/ + mpz_add_si(tmp, x0, 2); + mpz_mul_si(tmp, tmp, -2); + mpz_mod(tmp, tmp, n->orig_modulus); + /* to Montgomery form */ + ell_curve_init(tE[nc], ECM_EC_TYPE_MONTGOMERY, ECM_LAW_HOMOGENEOUS, n); + ell_point_init(tP[nc], tE[nc], n); + mod_from_rat2(tE[nc]->a4, tmp, x0, n->orig_modulus); + /* now compute real x0 */ + /* x0:=3*(3*nu^12+34*nu^10+117*nu^8+316*nu^6+1053*nu^4+2754*nu^2+2187); */ + mpz_set_si(x0, 3); + mpz_mul(x0, x0, nu2); mpz_add_si(x0, x0, 34); + mpz_mul(x0, x0, nu2); mpz_add_si(x0, x0, 117); + mpz_mul(x0, x0, nu2); mpz_add_si(x0, x0, 316); + mpz_mul(x0, x0, nu2); mpz_add_si(x0, x0, 1053); + mpz_mul(x0, x0, nu2); mpz_add_si(x0, x0, 2754); + mpz_mul(x0, x0, nu2); mpz_add_si(x0, x0, 2187); + mpz_mul_si(x0, x0, 3); + mpz_mod(x0, x0, n->orig_modulus); +#if DEBUG_TORSION >= 2 + gmp_printf("N:=%Zd;\n", n); + printf("nu:=%d;\n", nu); + gmp_printf("tau:=%Zd;\n", tau); + gmp_printf("lambda:=%Zd;\n", lambda); + gmp_printf("a:=%Zd;\n", tE[nc]->a4); + gmp_printf("x0:=%Zd;\n", x0); +#endif + /* x:=b*x0-a/3; not needed: y:=b*y0 */ + mpz_set_si(tmp, 3); + mod_from_rat2(tP[nc]->x, tE[nc]->a4, tmp, n->orig_modulus); + mpz_mul(b, b, x0); + mpz_mod(b, b, n->orig_modulus); + mpz_sub(tP[nc]->x, b, tP[nc]->x); + mpz_mod(tP[nc]->x, tP[nc]->x, n->orig_modulus); + nc++; + if(nc >= nE) + break; + } + mpz_clear(tau); + mpz_clear(lambda); + mpz_clear(nu2); + mpz_clear(tmp); + mpz_clear(b); + mpz_clear(x0); + if(ret != ECM_FACTOR_FOUND_STEP1 && nc < nE){ + printf("Not enough curves generated\n"); + return ECM_ERROR; + } + return ret; +} + +/* Assuming we can generate curves with given torsion using parameter s + in interval [smin..smax[. +*/ +int +build_curves_with_torsion(mpz_t f, mpmod_t n, ell_curve_t *tE, ell_point_t *tP, + char *torsion, int smin, int smax, int nE) +{ + int ret = 0; + + /* over Q: see Atkin-Morain, Math. 
Comp., 1993 */ + if(strcmp(torsion, "Z5") == 0) + return build_curves_with_torsion_Z5(f, n, tE, tP, smin, smax, nE); + else if(strcmp(torsion, "Z7") == 0) + return build_curves_with_torsion_Z7(f, n, tE, tP, smin, smax, nE); + else if(strcmp(torsion, "Z9") == 0) + return build_curves_with_torsion_Z9(f, n, tE, tP, smin, smax, nE); + else if(strcmp(torsion, "Z10") == 0) + return build_curves_with_torsion_Z10(f, n, tE, tP, smin, smax, nE); + else if(strcmp(torsion, "Z2xZ8") == 0) + return build_curves_with_torsion_Z2xZ8(f, n, tE, tP, smin, smax, nE); + /* no longer over Q */ + /** interesting when p = 1 mod 3 **/ + else if(strcmp(torsion, "Z3xZ3") == 0) /* over Q(sqrt(-3)) */ + return build_curves_with_torsion_Z3xZ3(f, n, tE, tP, smin, smax, nE); + else if(strcmp(torsion, "Z3xZ6") == 0) /* over Q(sqrt(-3)) */ + return build_curves_with_torsion_Z3xZ6(f, n, tE, tP, smin, smax, nE); + /** interesting when p = 1 mod 4 **/ + else if(strcmp(torsion, "Z4xZ4") == 0) /* over Q(sqrt(-1)) */ + return build_curves_with_torsion_Z4xZ4(f, n, tE, tP, smin, smax, nE); + else{ + printf("Unknown torsion group: %s\n", torsion); + ret = ECM_ERROR; + } + return ret; +} + +/* E is a curve with given torsion and (x, y) a point on E mod n. + OUTPUT: ECM_NO_FACTOR_FOUND if everything went ok + ECM_FACTOR_FOUND_STEP1 in case a factor was found when building E. + REM: E is defined over Z, not in mpres_t. 
+ */ +int +build_curves_with_torsion2(mpz_t f, mpz_t n, ell_curve_t E, + mpz_t x, mpz_t y, char *torsion, + mpz_t sigma) +{ + ell_curve_t tE[1]; + ell_point_t tP[1]; + mpmod_t modulus; + int ret, smin, smax; + + smin = (int)mpz_get_si(sigma); + smax = smin+10; + mpmod_init(modulus, n, ECM_MOD_DEFAULT); + ret = build_curves_with_torsion(f, modulus, tE, tP, torsion, smin, smax,1); + if(ret == ECM_NO_FACTOR_FOUND){ + E->type = tE[0]->type; + E->law = tE[0]->law; + mpz_set(E->a2, tE[0]->a2); + mpz_set(E->a4, tE[0]->a4); + mpz_set(E->a6, tE[0]->a6); + mpz_set(x, tP[0]->x); + mpz_set(y, tP[0]->y); + ell_point_clear(tP[0], tE[0], modulus); + ell_curve_clear(tE[0], modulus); + } + mpmod_clear(modulus); + return ret; +} diff -Nru gmp-ecm-7.0.4+ds/torsions.h gmp-ecm-7.0.5+ds/torsions.h --- gmp-ecm-7.0.4+ds/torsions.h 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/torsions.h 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,36 @@ +void mod_div_2(mpz_t x, mpz_t n); +int mod_from_rat(mpz_t r, mpq_t q, mpz_t N); +int mod_from_rat2(mpz_t r, mpz_t num, mpz_t den, mpz_t N); +void ec_force_point(ell_curve_t E, ell_point_t P, long *x0, mpz_t n); + +int +build_curves_with_torsion_Z5(mpz_t f, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int smin, int smax, int nE); +int +build_curves_with_torsion_Z7(mpz_t f, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int umin, int umax, int nE); +int +build_curves_with_torsion_Z9(mpz_t fac, mpmod_t n, ell_curve_t *tE, + ell_point_t *tP, int umin, int umax, int nE); +int +build_curves_with_torsion_Z10(mpz_t fac, mpmod_t n, ell_curve_t *tE, + ell_point_t *tP, int umin, int umax, int nE); +int +build_curves_with_torsion_Z2xZ8(mpz_t f, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int umin, int umax, int nE); +int +build_curves_with_torsion_Z3xZ3_DuNa(mpmod_t n, ell_curve_t *tE, ell_point_t *tP, + int smin, int smax, int nE); +int +build_curves_with_torsion_Z3xZ3(mpz_t f, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int umin, int 
umax, int nE); +int +build_curves_with_torsion_Z3xZ6(mpz_t f, mpmod_t n, + ell_curve_t *tE, ell_point_t *tP, + int umin, int umax, int nE); +int build_curves_with_torsion(mpz_t f, mpmod_t n, ell_curve_t *tE, ell_point_t *tP, char *torsion, int smin, int smax, int nE); +int build_curves_with_torsion2(mpz_t f, mpz_t n, ell_curve_t E, mpz_t x, mpz_t y, char *torsion, mpz_t sigma); diff -Nru gmp-ecm-7.0.4+ds/x86_64/autogen.py gmp-ecm-7.0.5+ds/x86_64/autogen.py --- gmp-ecm-7.0.4+ds/x86_64/autogen.py 2006-03-07 15:57:38.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/autogen.py 2022-06-06 14:16:49.000000000 +0000 @@ -287,23 +287,313 @@ GLOBL GSYM_PREFIX`'mulredc1 TYPE(GSYM_PREFIX`'mulredc1,`function') +ifdef(`WINDOWS64_ABI', +# stack: inv_m, %r9: m, %r8: y, %rdx: x, %rcx: *z +`define(`INV_M', `0x28(%rsp)') +define(`M', `%r9') +define(`Y', `%r8') +define(`X', `%rdx') +define(`Z', `%rcx') +define(`TMP2', `%r10') +define(`TMP1', `%r8')', +# %r8: inv_m, %rcx: m, %rdx: y, %rsi : x, %rdi : *z +`define(`INV_M', `%r8') +define(`M', `%rcx') +define(`Y', `%rdx') +define(`X', `%rsi') +define(`Z', `%rdi') +define(`TMP2', `%r10') +define(`TMP1', `%r9')') + GSYM_PREFIX`'mulredc1: -# %r8 : inv_m -# %rcx : m -# %rdx : y -# %rsi : x -# %rdi : z - movq %rdx, %rax - mulq %rsi - movq %rdx, %r10 - movq %rax, %r9 # store xy in [r9:r10] - mulq %r8 # compute u - mulq %rcx # compute u*m - addq %r9, %rax # rax is 0, now (carry is important) - adcq %r10, %rdx - movq %rdx, (%rdi) + movq Y, %rax + mulq X + movq %rdx, TMP2 + movq %rax, TMP1 # store xy in [r9:r10] + mulq INV_M # compute u + mulq M # compute u*m + addq TMP1, %rax # rax is 0, now (carry is important) + adcq TMP2, %rdx + movq %rdx, (Z) adcq $0, %rax ret + +ifdef(`WINDOWS64_ABI', +, +` +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif +') dnl +""" +elif k == 2: + print """# +# mp_limb_t mulredc2(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, +# const mp_limb_t *m, mp_limb_t inv_m); +# +# 
Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 +# Needs %rbx, %rsp, %rbp, %r12-%r15 restored +# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) +# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored + +# This stuff is run through M4 twice, first when generating the +# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) +# and again when generating the mulredc*.s files from the mulredc*.asm files +# when the user compiles the program. +# We used to substitute XP etc. by register names in the first pass, +# but now with switching between Linux and Windows ABI, we do it in +# the second pass instead when we know which ABI we have, as that +# allows us to assign registers differently for the two ABIs. +# That means that the defines for XP etc., need to be quoted once to be +# protected in the first M4 pass, so that they are processed and +# occurrences of XP etc. happen only in the second pass. + + + +include(`config.m4') + + TEXT +.p2align 6 # x86_64 L1 code cache line is 64 bytes long + GLOBL GSYM_PREFIX`'mulredc2 + TYPE(GSYM_PREFIX`'mulredc`'2,`function') + +# Implements multiplication and REDC for two input numbers of LENGTH words +ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') +# tmp[0 ... len+1] = 0 +# for (i = 0; i < len; i++) +# { +# t = x[i] * y[0]; /* Keep and reuse this product */ +# u = ((t + tmp[0]) * invm) % 2^64 +# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ +# for (j = 1; j < len; j++) +# { +# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); +# /* put new carry in cy */ +# } +# tmp[len] = cy; +# } +# z[0 ... len-1] = tmp[0 ... len-1] +# return (tmp[len]) + + +# Values that are referenced only once in the loop over j go into r8 .. r14, +# In the inner loop (over j), tmp, x[i], y, m, and u are constant. +# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values +# stay in registers and are referenced as +# TP = tmp, YP = y, MP = m, +# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry + +define(`T0', `rsi')dnl +define(`T0l', `esi')dnl +define(`T1', `rbx')dnl +define(`T1l', `ebx')dnl +define(`CY', `rcx')dnl +define(`CYl', `ecx')dnl +define(`CYb', `cl')dnl +define(`XI', `r14')dnl # register that holds x[i] value +define(`U', `r11')dnl +define(`XP', `r13')dnl # register that points to the x arraz +define(`TP', `rbp')dnl # register that points to t + i +define(`I', `r12')dnl # register that holds loop counter i +define(`Il', `r12d')dnl # register that holds loop counter i +define(`ZP', `rdi')dnl # register that holds z. Same as passed in +ifdef(`WINDOWS64_ABI', +`define(`YP', `r8')dnl # points to y array, same as passed in +define(`MP', `r9')dnl # points to m array, same as passed in +define(`INVM', `r10')dnl # register that holds invm. Same as passed in' +, +`define(`YP', `r9')dnl # register that points to the y array +define(`MP', `r10')dnl # register that points to the m array +define(`INVM', `r8')dnl # register that holds invm. Same as passed in' +)dnl + +`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U +`#' `YP' = YP, `MP' = MP, `TP' = TP + +# local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words +# The tmp array needs LENGTH+1 entries, the last one is so that we can +# store CY at tmp[j+1] for j == len-1 + + + +GSYM_PREFIX`'mulredc2: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 +ifdef(`WINDOWS64_ABI', +` pushq %rsi + pushq %rdi +') dnl +ifdef(`WINDOWS64_ABI', +` movq %rdx, %XP + movq %rcx, %ZP + movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' +, +` movq %rsi, %XP # store x in XP + movq %rdx, %YP # store y in YP + movq %rcx, %MP # store m in MP' +) dnl + subq $24, %rsp # subtract size of local vars + + +######################################################################### +# i = 0 pass +######################################################################### + +# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m +# %CY < 255 (i.e. only low byte may be != 0) + +# Pass for j = 0. We need to fetch x[i] from memory and compute the new u + + movq (%XP), %XI # XI = x[0] + movq (%YP), %rax # rax = y[0] + + xorl %CYl, %CYl # set %CY to 0 + lea (%rsp), %TP # store addr of tmp array in TP + movl %CYl, %Il # Set %I to 0 + + mulq %XI # rdx:rax = y[0] * x[i] + addq $1, %I + + movq %rax, %T0 # Move low word of product to T0 + movq %rdx, %T1 # Move high word of product to T1 + +ifdef(`MULREDC_SVOBODA', +, `' +` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' +) movq %rax, %U # this is the new u value + + mulq (%MP) # multipy u*m[0] + addq %rax, %T0 # Now %T0 = 0, need not be stored + movq 8(%YP), %rax # Fetch y[1] + adcq %rdx, %T1 # + setc %CYb + # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence + # CY:T1 <= 2*2^64 - 4 + +define(`TT', defn(`T0'))dnl +define(`TTl', defn(`T0l'))dnl +define(`T0', defn(`T1'))dnl +define(`T0l', defn(`T1l'))dnl +define(`T1', defn(`TT'))dnl +define(`T1l', defn(`TTl'))dnl +undefine(`TT')dnl +undefine(`TTl')dnl +`#' Now `T0' = T0, `T1' = T1 + + +`#' Pass for j = 1. Don't fetch new data from y[j+1]. 
+ + movl %CYl, %T1l # T1 = CY <= 1 + + mulq %XI # y[j] * x[i] + addq %rax, %T0 # Add low word to T0 + movq 8(%MP), %rax # Fetch m[j] into %rax + adcq %rdx, %T1 # Add high word with carry to T1 + mulq %U # m[j]*u + addq %rax, %T0 # Add low word to T0 + movq %T0, 0(%TP) # Store T0 in tmp[j-1] + adcq %rdx, %T1 # Add high word with carry to T1 + movq %T1, 8(%TP) # Store T1 in tmp[j] + setc %CYb # %CY <= 1 + movq %CY, 16(%TP) # Store CY in tmp[j+1] + +######################################################################### +# i > 0 passes +######################################################################### + +.p2align 5,,4 +LABEL_SUFFIX(1) + +# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m +# %CY < 255 (i.e. only low byte may be > 0) + +# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory +# and compute the new u + + movq (%XP,%I,8), %XI # XI = x[i] + movq (%YP), %rax # rax = y[0] +#init the register tmp ring buffer + movq (%TP), %T0 # Load tmp[0] into T0 + movq 8(%TP), %T1 # Load tmp[1] into T1 + + mulq %XI # rdx:rax = y[0] * x[i] + addq $1, %I + + addq %T0, %rax # Add T0 to low word + adcq %rdx, %T1 # Add high word with carry to T1 + setc %CYb # %CY <= 1 + + movq %rax, %T0 # Save sum of low words in T0 + imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 + movq %rax, %U # this is the new u value + + mulq (%MP) # multipy u*m[0] + addq %rax, %T0 # Now %T0 = 0, need not be stored + adcq %rdx, %T1 # + + movq 8(%YP), %rax # Fetch y[1] + +define(`TT', defn(`T0'))dnl +define(`TTl', defn(`T0l'))dnl +define(`T0', defn(`T1'))dnl +define(`T0l', defn(`T1l'))dnl +define(`T1', defn(`TT'))dnl +define(`T1l', defn(`TTl'))dnl +undefine(`TT')dnl +undefine(`TTl')dnl +`#' Now `T0' = T0, `T1' = T1 + + +`#' Pass for j = 1. Don't fetch new data from y[j+1]. 
+ + movq 16(%TP), %T1 + adcq %CY, %T1 # T1 = CY + tmp[j+1] + + mulq %XI # y[j] * x[i] + addq %rax, %T0 # Add low word to T0 + movq 8(%MP), %rax # Fetch m[j] into %rax + adcq %rdx, %T1 # Add high word with carry to T1 + setc %CYb # %CY <= 1 + mulq %U # m[j]*u + addq %rax, %T0 # Add low word to T0 + movq %T0, 0(%TP) # Store T0 in tmp[j-1] + adcq %rdx, %T1 # Add high word with carry to T1 + movq %T1, 8(%TP) # Store T1 in tmp[j] + adcb $0, %CYb # %CY <= 2 + movq %CY, 16(%TP) # Store CY in tmp[j+1] + + cmpq $2, %I + jb 1b + +# Copy result from tmp memory to z + movq (%TP), %rax + movq 8(%TP), %rdx + movq %rax, (%ZP) + movq %rdx, 8(%ZP) + + movl %CYl, %eax # use carry as return value + addq $24, %rsp +ifdef(`WINDOWS64_ABI', +` popq %rdi + popq %rsi +') dnl + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +ifdef(`WINDOWS64_ABI', +, +` +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif +') dnl """ else: print mulredc_k_rolled(k) diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc10.asm gmp-ecm-7.0.5+ds/x86_64/mulredc10.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc10.asm 2016-08-25 13:42:10.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc10.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,858 +0,0 @@ -# mp_limb_t mulredc10(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. 
by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. - - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc10 - TYPE(GSYM_PREFIX`'mulredc`'10,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc10: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $88, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U 
= u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 64(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 72(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 80(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add 
low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 
40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq 
%rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9. Don't fetch new data from y[j+1]. 
- - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 64(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 72(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 80(%TP) # Store CY in tmp[j+1] - - cmpq $10, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - - movl %CYl, %eax # use carry as return value - addq $88, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_10.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_10.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_10.asm 2016-08-25 13:42:13.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_10.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,397 +0,0 @@ -# mp_limb_t mulredc1_10(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron 
L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_10 - TYPE(GSYM_PREFIX`'mulredc1_`'10,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_10: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy 
u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 64(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 72(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_11.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_11.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_11.asm 2016-08-25 13:42:13.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_11.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,431 +0,0 @@ -# mp_limb_t mulredc1_11(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_11 - TYPE(GSYM_PREFIX`'mulredc1_`'11,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_11: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 
2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 72(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 80(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_12.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_12.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_12.asm 2016-08-25 13:42:13.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_12.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,465 +0,0 @@ -# mp_limb_t mulredc1_12(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_12 - TYPE(GSYM_PREFIX`'mulredc1_`'12,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_12: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 
2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 72(ZP) # Store T0 in z[10-1] - movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 11. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 88(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 80(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 88(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_13.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_13.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_13.asm 2016-08-25 13:42:13.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_13.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,499 +0,0 @@ -# mp_limb_t mulredc1_13(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_13 - TYPE(GSYM_PREFIX`'mulredc1_`'13,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_13: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 
2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 72(ZP) # Store T0 in z[10-1] - movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 11 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 88(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 80(ZP) # Store T0 in z[11-1] - movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 12. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 96(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 88(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 96(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_14.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_14.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_14.asm 2016-08-25 13:42:13.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_14.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,533 +0,0 @@ -# mp_limb_t mulredc1_14(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_14 - TYPE(GSYM_PREFIX`'mulredc1_`'14,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_14: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 
2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 72(ZP) # Store T0 in z[10-1] - movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 11 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 88(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 80(ZP) # Store T0 in z[11-1] - movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 12 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 96(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 88(ZP) # Store T0 in z[12-1] - movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 13. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 104(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 96(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 104(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_15.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_15.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_15.asm 2016-08-25 13:42:14.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_15.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,567 +0,0 @@ -# mp_limb_t mulredc1_15(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_15 - TYPE(GSYM_PREFIX`'mulredc1_`'15,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_15: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 
2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 72(ZP) # Store T0 in z[10-1] - movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 11 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 88(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 80(ZP) # Store T0 in z[11-1] - movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 12 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 96(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 88(ZP) # Store T0 in z[12-1] - movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 13 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 104(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 96(ZP) # Store T0 in z[13-1] - movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 14. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 112(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 104(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 112(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_16.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_16.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_16.asm 2016-08-25 13:42:14.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_16.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,601 +0,0 @@ -# mp_limb_t mulredc1_16(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_16 - TYPE(GSYM_PREFIX`'mulredc1_`'16,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_16: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 
2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 72(ZP) # Store T0 in z[10-1] - movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 11 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 88(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 80(ZP) # Store T0 in z[11-1] - movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 12 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 96(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 88(ZP) # Store T0 in z[12-1] - movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 13 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 104(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 96(ZP) # Store T0 in z[13-1] - movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 14 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 112(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 104(ZP) # Store T0 in z[14-1] - movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 15. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 120(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 112(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 120(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_17.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_17.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_17.asm 2016-08-25 13:42:14.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_17.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,635 +0,0 @@ -# mp_limb_t mulredc1_17(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_17 - TYPE(GSYM_PREFIX`'mulredc1_`'17,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_17: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 
2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 72(ZP) # Store T0 in z[10-1] - movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 11 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 88(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 80(ZP) # Store T0 in z[11-1] - movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 12 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 96(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 88(ZP) # Store T0 in z[12-1] - movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 13 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 104(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 96(ZP) # Store T0 in z[13-1] - movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 14 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 112(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 104(ZP) # Store T0 in z[14-1] - movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 15 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 120(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 112(ZP) # Store T0 in z[15-1] - movq 128(YP), %rax # Fetch y[j+1] = y[16] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 16. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 128(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 120(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 128(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_18.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_18.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_18.asm 2016-08-25 13:42:14.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_18.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,669 +0,0 @@ -# mp_limb_t mulredc1_18(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_18 - TYPE(GSYM_PREFIX`'mulredc1_`'18,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_18: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 
2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 72(ZP) # Store T0 in z[10-1] - movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 11 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 88(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 80(ZP) # Store T0 in z[11-1] - movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 12 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 96(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 88(ZP) # Store T0 in z[12-1] - movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 13 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 104(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 96(ZP) # Store T0 in z[13-1] - movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 14 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 112(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 104(ZP) # Store T0 in z[14-1] - movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 15 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 120(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 112(ZP) # Store T0 in z[15-1] - movq 128(YP), %rax # Fetch y[j+1] = y[16] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 16 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 128(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 120(ZP) # Store T0 in z[16-1] - movq 136(YP), %rax # Fetch y[j+1] = y[17] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 17. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 136(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 128(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 136(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_19.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_19.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_19.asm 2016-08-25 13:42:14.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_19.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,703 +0,0 @@ -# mp_limb_t mulredc1_19(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_19 - TYPE(GSYM_PREFIX`'mulredc1_`'19,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_19: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 
2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 72(ZP) # Store T0 in z[10-1] - movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 11 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 88(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 80(ZP) # Store T0 in z[11-1] - movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 12 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 96(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 88(ZP) # Store T0 in z[12-1] - movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 13 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 104(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 96(ZP) # Store T0 in z[13-1] - movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 14 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 112(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 104(ZP) # Store T0 in z[14-1] - movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 15 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 120(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 112(ZP) # Store T0 in z[15-1] - movq 128(YP), %rax # Fetch y[j+1] = y[16] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 16 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 128(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 120(ZP) # Store T0 in z[16-1] - movq 136(YP), %rax # Fetch y[j+1] = y[17] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 17 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 136(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 128(ZP) # Store T0 in z[17-1] - movq 144(YP), %rax # Fetch y[j+1] = y[18] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 18. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 144(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 136(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 144(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc11.asm gmp-ecm-7.0.5+ds/x86_64/mulredc11.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc11.asm 2016-08-25 13:42:10.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc11.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,933 +0,0 @@ -# mp_limb_t mulredc11(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc11 - TYPE(GSYM_PREFIX`'mulredc`'11,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc11: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $96, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 72(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 80(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 88(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add 
low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 
40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq 
%rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', defn(`T0'))dnl 
-define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10. Don't fetch new data from y[j+1]. - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 72(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 80(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 88(%TP) # Store CY in tmp[j+1] - - cmpq $11, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq %rax, 80(%ZP) - - movl %CYl, %eax # use carry as return value - addq $96, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_20.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_20.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_20.asm 2016-08-25 13:42:14.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_20.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,737 +0,0 @@ -# mp_limb_t mulredc1_20(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: 
%rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_20 - TYPE(GSYM_PREFIX`'mulredc1_`'20,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_20: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is 
same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 56(ZP) # Store T0 in z[8-1] - movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 9 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 72(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 64(ZP) # Store T0 in z[9-1] - movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 10 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 80(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 72(ZP) # Store T0 in z[10-1] - movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 11 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 88(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 80(ZP) # Store T0 in z[11-1] - movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 12 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 96(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 88(ZP) # Store T0 in z[12-1] - movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 13 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 104(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 96(ZP) # Store T0 in z[13-1] - movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 14 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 112(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 104(ZP) # Store T0 in z[14-1] - movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 15 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 120(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 112(ZP) # Store T0 in z[15-1] - movq 128(YP), %rax # Fetch y[j+1] = y[16] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 16 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 128(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 120(ZP) # Store T0 in z[16-1] - movq 136(YP), %rax # Fetch y[j+1] = y[17] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 17 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 136(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 128(ZP) # Store T0 in z[17-1] - movq 144(YP), %rax # Fetch y[j+1] = y[18] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 18 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 144(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 136(ZP) # Store T0 in z[18-1] - movq 152(YP), %rax # Fetch y[j+1] = y[19] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 19. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 152(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 144(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 152(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_2.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_2.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_2.asm 2016-08-25 13:42:12.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_2.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,125 +0,0 @@ -# mp_limb_t mulredc1_2(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_2 - TYPE(GSYM_PREFIX`'mulredc1_`'2,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_2: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 0(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 8(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc12.asm gmp-ecm-7.0.5+ds/x86_64/mulredc12.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc12.asm 2016-08-25 13:42:10.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc12.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1008 +0,0 @@ -# mp_limb_t mulredc12(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc12 - TYPE(GSYM_PREFIX`'mulredc`'12,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc12: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $104, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 80(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 88(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 96(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add 
low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 
40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq 
%rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', defn(`T0'))dnl 
-define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 80(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11. Don't fetch new data from y[j+1]. 
- - movq 96(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 80(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 88(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 96(%TP) # Store CY in tmp[j+1] - - cmpq $12, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq 88(%TP), %rdx - movq %rax, 80(%ZP) - movq %rdx, 88(%ZP) - - movl %CYl, %eax # use carry as return value - addq $104, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_3.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_3.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_3.asm 2016-08-25 13:42:12.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_3.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,159 +0,0 @@ -# mp_limb_t mulredc1_3(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, 
-`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_3 - TYPE(GSYM_PREFIX`'mulredc1_`'3,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_3: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = 
((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 8(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 16(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc13.asm gmp-ecm-7.0.5+ds/x86_64/mulredc13.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc13.asm 2016-08-25 13:42:10.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc13.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1083 +0,0 @@ -# mp_limb_t mulredc13(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc13 - TYPE(GSYM_PREFIX`'mulredc`'13,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc13: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $112, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 96(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 88(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 96(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 104(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add 
low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 
40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq 
%rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', defn(`T0'))dnl 
-define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 80(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 96(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 88(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', 
defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12. Don't fetch new data from y[j+1]. - - movq 104(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 96(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 88(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 96(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 104(%TP) # Store CY in tmp[j+1] - - cmpq $13, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq 88(%TP), %rdx - movq %rax, 80(%ZP) - movq %rdx, 88(%ZP) - movq 96(%TP), %rax - movq %rax, 96(%ZP) - - movl %CYl, %eax # use carry as return value - addq $112, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_4.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_4.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_4.asm 2016-08-25 13:42:12.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_4.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,193 +0,0 @@ -# mp_limb_t mulredc1_4(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, 
m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_4 - TYPE(GSYM_PREFIX`'mulredc1_`'4,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_4: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - 
xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 16(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 24(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc14.asm gmp-ecm-7.0.5+ds/x86_64/mulredc14.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc14.asm 2016-08-25 13:42:10.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc14.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1158 +0,0 @@ -# mp_limb_t mulredc14(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc14 - TYPE(GSYM_PREFIX`'mulredc`'14,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc14: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $120, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 96(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 104(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 96(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 104(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 112(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - 
movq 16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # 
Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - 
mulq 40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - 
adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', 
defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 80(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 96(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 88(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl 
-define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 104(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 96(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13. Don't fetch new data from y[j+1]. 
- - movq 112(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 104(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 96(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 104(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 112(%TP) # Store CY in tmp[j+1] - - cmpq $14, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq 88(%TP), %rdx - movq %rax, 80(%ZP) - movq %rdx, 88(%ZP) - movq 96(%TP), %rax - movq 104(%TP), %rdx - movq %rax, 96(%ZP) - movq %rdx, 104(%ZP) - - movl %CYl, %eax # use carry as return value - addq $120, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_5.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_5.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_5.asm 2016-08-25 13:42:12.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_5.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,227 +0,0 @@ -# mp_limb_t mulredc1_5(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - 
-ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_5 - TYPE(GSYM_PREFIX`'mulredc1_`'5,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_5: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to 
T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 24(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 32(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc15.asm gmp-ecm-7.0.5+ds/x86_64/mulredc15.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc15.asm 2016-08-25 13:42:11.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc15.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1233 +0,0 @@ -# mp_limb_t mulredc15(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc15 - TYPE(GSYM_PREFIX`'mulredc`'15,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc15: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $128, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 96(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 104(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 14. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 112(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 104(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 112(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 120(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - 
movq 16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # 
Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - 
mulq 40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - 
adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', 
defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 80(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 96(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 88(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl 
-define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 104(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 96(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 112(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 104(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now 
`T0' = T0, `T1' = T1 - - -`#' Pass for j = 14. Don't fetch new data from y[j+1]. - - movq 120(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 112(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 104(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 112(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 120(%TP) # Store CY in tmp[j+1] - - cmpq $15, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq 88(%TP), %rdx - movq %rax, 80(%ZP) - movq %rdx, 88(%ZP) - movq 96(%TP), %rax - movq 104(%TP), %rdx - movq %rax, 96(%ZP) - movq %rdx, 104(%ZP) - movq 112(%TP), %rax - movq %rax, 112(%ZP) - - movl %CYl, %eax # use carry as return value - addq $128, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_6.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_6.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_6.asm 2016-08-25 13:42:12.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_6.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,261 +0,0 @@ -# mp_limb_t mulredc1_6(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: 
%r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_6 - TYPE(GSYM_PREFIX`'mulredc1_`'6,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_6: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl 
CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 32(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 40(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc16.asm gmp-ecm-7.0.5+ds/x86_64/mulredc16.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc16.asm 2016-08-25 13:42:11.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc16.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1308 +0,0 @@ -# mp_limb_t mulredc16(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc16 - TYPE(GSYM_PREFIX`'mulredc`'16,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc16: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $136, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 96(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 104(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 112(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 120(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 112(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 120(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 128(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - 
movq 16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # 
Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - 
mulq 40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - 
adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', 
defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 80(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 96(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 88(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl 
-define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 104(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 96(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 112(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 104(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now 
`T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 120(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 112(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15. Don't fetch new data from y[j+1]. 
- - movq 128(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 120(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 112(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 120(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 128(%TP) # Store CY in tmp[j+1] - - cmpq $16, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq 88(%TP), %rdx - movq %rax, 80(%ZP) - movq %rdx, 88(%ZP) - movq 96(%TP), %rax - movq 104(%TP), %rdx - movq %rax, 96(%ZP) - movq %rdx, 104(%ZP) - movq 112(%TP), %rax - movq 120(%TP), %rdx - movq %rax, 112(%ZP) - movq %rdx, 120(%ZP) - - movl %CYl, %eax # use carry as return value - addq $136, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_7.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_7.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_7.asm 2016-08-25 13:42:12.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_7.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,295 +0,0 @@ -# mp_limb_t mulredc1_7(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, 
%rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_7 - TYPE(GSYM_PREFIX`'mulredc1_`'7,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_7: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # 
rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 40(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 48(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc17.asm gmp-ecm-7.0.5+ds/x86_64/mulredc17.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc17.asm 2016-08-25 13:42:11.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc17.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1383 +0,0 @@ -# mp_limb_t mulredc17(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc17 - TYPE(GSYM_PREFIX`'mulredc`'17,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc17: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $144, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 96(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 104(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 112(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 120(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 112(%TP) `#' Store T0 in tmp[15-1] - movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 16. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 128(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 120(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 128(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 136(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - 
movq 16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # 
Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - 
mulq 40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - 
adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', 
defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 80(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 96(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 88(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl 
-define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 104(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 96(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 112(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 104(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now 
`T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 120(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 112(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 128(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 120(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 112(%TP) `#' Store T0 in tmp[15-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 16. Don't fetch new data from y[j+1]. 
- - movq 136(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 128(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 120(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 128(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 136(%TP) # Store CY in tmp[j+1] - - cmpq $17, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq 88(%TP), %rdx - movq %rax, 80(%ZP) - movq %rdx, 88(%ZP) - movq 96(%TP), %rax - movq 104(%TP), %rdx - movq %rax, 96(%ZP) - movq %rdx, 104(%ZP) - movq 112(%TP), %rax - movq 120(%TP), %rdx - movq %rax, 112(%ZP) - movq %rdx, 120(%ZP) - movq 128(%TP), %rax - movq %rax, 128(%ZP) - - movl %CYl, %eax # use carry as return value - addq $144, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_8.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_8.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_8.asm 2016-08-25 13:42:13.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_8.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,329 +0,0 @@ -# mp_limb_t mulredc1_8(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: 
%r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_8 - TYPE(GSYM_PREFIX`'mulredc1_`'8,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_8: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - 
- xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 48(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 56(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc18.asm gmp-ecm-7.0.5+ds/x86_64/mulredc18.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc18.asm 2016-08-25 13:42:11.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc18.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1458 +0,0 @@ -# mp_limb_t mulredc18(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc18 - TYPE(GSYM_PREFIX`'mulredc`'18,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc18: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $152, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 96(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 104(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 112(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 120(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 112(%TP) `#' Store T0 in tmp[15-1] - movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 16 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 128(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 120(%TP) `#' Store T0 in tmp[16-1] - movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 17. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 136(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 128(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 136(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 144(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - 
movq 16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # 
Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - 
mulq 40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - 
adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', 
defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 80(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 96(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 88(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl 
-define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 104(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 96(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 112(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 104(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now 
`T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 120(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 112(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 128(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 120(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 112(%TP) `#' Store T0 in tmp[15-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 16 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], 
%U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 136(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 128(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 120(%TP) `#' Store T0 in tmp[16-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 17. Don't fetch new data from y[j+1]. - - movq 144(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 136(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 128(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 136(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 144(%TP) # Store CY in tmp[j+1] - - cmpq $18, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq 88(%TP), %rdx - movq %rax, 80(%ZP) - movq %rdx, 88(%ZP) - movq 
96(%TP), %rax - movq 104(%TP), %rdx - movq %rax, 96(%ZP) - movq %rdx, 104(%ZP) - movq 112(%TP), %rax - movq 120(%TP), %rdx - movq %rax, 112(%ZP) - movq %rdx, 120(%ZP) - movq 128(%TP), %rax - movq 136(%TP), %rdx - movq %rax, 128(%ZP) - movq %rdx, 136(%ZP) - - movl %CYl, %eax # use carry as return value - addq $152, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1_9.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1_9.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1_9.asm 2016-08-25 13:42:13.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1_9.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,363 +0,0 @@ -# mp_limb_t mulredc1_9(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - - - -include(`config.m4') - -ifdef(`WINDOWS64_ABI', -`define(`Y_PARAM', `%r8')dnl -define(`INVM_PARAM',`72(%rsp)')dnl' -, -`define(`Y_PARAM', `%rdx')dnl -define(`INVM_PARAM',`%r8')dnl' -)dnl - TEXT -.p2align 6 # Opteron L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc1_9 - TYPE(GSYM_PREFIX`'mulredc1_`'9,`function') - -# Implements multiplication and REDC for one input numbers of LENGTH words -# and a multiplier of one word -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# YP = y, MP = m, -# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `%rsi')dnl -define(`T1', `%rbx')dnl -define(`CY', `%rcx')dnl -define(`CYl', `%ecx')dnl -define(`CYb', `%cl')dnl -define(`X', `%r14')dnl # register that holds x value -define(`U', `%r11')dnl -define(`YP', `%r9')dnl # register that points to the y array -define(`MP', `%r10')dnl # register that points to the m array -define(`ZP', `%rdi')dnl # register that holds z - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U -`#' `YP' = YP, `MP' = MP - -GSYM_PREFIX`'mulredc1_9: - - -######################################################################### -# i = 0 pass -######################################################################### - -`#' register values at loop entry: YP = y, MP = m - -# We need to compute u - - movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) - pushq %rbx - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi - movq %r9, MP # store m in MP - movq Y_PARAM, YP - movq %rcx, ZP - movq %rdx, X' -, -` movq Y_PARAM, YP - movq %rcx, MP - movq %rsi, X # store x in X - # ZP is same as passed in' -) - - xorl CYl, CYl # set %CY to 0 - - mulq X # rdx:rax = y[0] * x - - movq %rax, T0 # Move low word of product to T0 - movq %rdx, T1 # Move high word of product to T1 - - imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, U # this is the new u value - - mulq (MP) # multipy u*m[0] - addq %rax, T0 # Now %T0 = 0, need not be stored - movq 8(YP), %rax # Fetch y[1] - adcq %rdx, T1 # - setc CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 1 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) 
-# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 8(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 0(ZP) # Store T0 in z[1-1] - movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 2 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 16(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 8(ZP) # Store T0 in z[2-1] - movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 3 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 24(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 16(ZP) # Store T0 in z[3-1] - movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 4 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 32(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 24(ZP) # Store T0 in z[4-1] - movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 5 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 40(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 32(ZP) # Store T0 in z[5-1] - movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 6 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 48(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 40(ZP) # Store T0 in z[6-1] - movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 7 -# Register values at entry: -# %rax = y[j], X = x, U = u -# T0 = value to store in tmp[j], T1 undefined -# CY = carry into T1 (is <= 2) -# We have CY:T1 <= 2 * 2^64 - 2 - - movq CY, T1 # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq X # y[j] * x - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, T0 # Add low word to T0 - movq 56(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq T0, %rax # Add T0 and low word - movq %rax, 48(ZP) # Store T0 in z[7-1] - movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - setc CYb # CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`T0', defn(`T1'))dnl -define(`T1', defn(`TT'))dnl -undefine(`TT')dnl -`#' Now `T0' = T0, `T1' = T1 - - -# Pass for j = 8. Don't fetch new data from y[j+1]. 
- - movq CY, T1 # T1 = CY <= 1 - - mulq X # y[j] * x[i] - addq %rax, T0 # Add low word to T0 - movq 64(MP), %rax # Fetch m[j] into %rax - adcq %rdx, T1 # Add high word with carry to T1 - mulq U # m[j]*u - addq %rax, T0 # Add low word to T0 - movq T0, 56(ZP) # Store T0 in z[j-1] - adcq %rdx, T1 # Add high word with carry to T1 - movq T1, 64(ZP) # Store T1 in tmp[j] - setc CYb # %CY <= 1 - - movq CY, %rax # use carry as return value -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc19.asm gmp-ecm-7.0.5+ds/x86_64/mulredc19.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc19.asm 2016-08-25 13:42:11.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc19.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1533 +0,0 @@ -# mp_limb_t mulredc19(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc19 - TYPE(GSYM_PREFIX`'mulredc`'19,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc19: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $160, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 96(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 104(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 112(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 120(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 112(%TP) `#' Store T0 in tmp[15-1] - movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 16 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 128(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 120(%TP) `#' Store T0 in tmp[16-1] - movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 17 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 136(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 128(%TP) `#' Store T0 in tmp[17-1] - movq 144(%YP), %rax `#' Fetch y[j+1] = y[18] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 18. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 144(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 136(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 144(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 152(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - 
movq 16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # 
Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - 
mulq 40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - 
adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', 
defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 80(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 96(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 88(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl 
-define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 104(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 96(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 112(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 104(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now 
`T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 120(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 112(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 128(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 120(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 112(%TP) `#' Store T0 in tmp[15-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 16 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], 
%U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 136(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 128(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 120(%TP) `#' Store T0 in tmp[16-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 17 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 144(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 136(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 144(%YP), %rax `#' Fetch y[j+1] = y[18] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 128(%TP) `#' Store T0 in tmp[17-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 18. Don't fetch new data from y[j+1]. 
- - movq 152(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 144(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 136(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 144(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 152(%TP) # Store CY in tmp[j+1] - - cmpq $19, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq 88(%TP), %rdx - movq %rax, 80(%ZP) - movq %rdx, 88(%ZP) - movq 96(%TP), %rax - movq 104(%TP), %rdx - movq %rax, 96(%ZP) - movq %rdx, 104(%ZP) - movq 112(%TP), %rax - movq 120(%TP), %rdx - movq %rax, 112(%ZP) - movq %rdx, 120(%ZP) - movq 128(%TP), %rax - movq 136(%TP), %rdx - movq %rax, 128(%ZP) - movq %rdx, 136(%ZP) - movq 144(%TP), %rax - movq %rax, 144(%ZP) - - movl %CYl, %eax # use carry as return value - addq $160, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc1.asm gmp-ecm-7.0.5+ds/x86_64/mulredc1.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc1.asm 2016-08-25 13:40:28.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1.asm 2022-06-06 14:16:49.000000000 +0000 @@ -52,3 +52,11 @@ movq %rdx, (Z) adcq $0, %rax ret + +ifdef(`WINDOWS64_ABI', +, +` +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif +') dnl diff -Nru 
gmp-ecm-7.0.4+ds/x86_64/mulredc1.m4 gmp-ecm-7.0.5+ds/x86_64/mulredc1.m4 --- gmp-ecm-7.0.4+ds/x86_64/mulredc1.m4 2016-08-25 13:40:56.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc1.m4 2022-06-06 14:16:49.000000000 +0000 @@ -174,3 +174,11 @@ popq %r14 popq %rbx ret + +`ifdef(`WINDOWS64_ABI', +, +` +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif +') dnl' diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc20.asm gmp-ecm-7.0.5+ds/x86_64/mulredc20.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc20.asm 2016-08-25 13:42:11.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc20.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1608 +0,0 @@ -# mp_limb_t mulredc20(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc20 - TYPE(GSYM_PREFIX`'mulredc`'20,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc20: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $168, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 72(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 80(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 88(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 96(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 104(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 112(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 120(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 112(%TP) `#' Store T0 in tmp[15-1] - movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 16 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 128(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 120(%TP) `#' Store T0 in tmp[16-1] - movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 17 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 136(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 128(%TP) `#' Store T0 in tmp[17-1] - movq 144(%YP), %rax `#' Fetch y[j+1] = y[18] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 18 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 144(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 136(%TP) `#' Store T0 in tmp[18-1] - movq 152(%YP), %rax `#' Fetch y[j+1] = y[19] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 19. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 152(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 144(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 152(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 160(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - 
movq 16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # 
Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - 
mulq 40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - 
adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 64(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 9 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 80(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 72(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] - -define(`TT', 
defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 10 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 88(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 80(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 11 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 96(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 88(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl 
-define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 12 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 104(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 96(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 13 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 112(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 104(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now 
`T0' = T0, `T1' = T1 - - -`#' Pass for j = 14 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 120(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 112(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 15 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 128(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 120(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 112(%TP) `#' Store T0 in tmp[15-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 16 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], 
%U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 136(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 128(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 120(%TP) `#' Store T0 in tmp[16-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 17 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 144(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 136(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 144(%YP), %rax `#' Fetch y[j+1] = y[18] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 128(%TP) `#' Store T0 in tmp[17-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 18 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry 
into T1, carry flag: also carry into T1 - - movq 152(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 144(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 152(%YP), %rax `#' Fetch y[j+1] = y[19] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 136(%TP) `#' Store T0 in tmp[18-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 19. Don't fetch new data from y[j+1]. - - movq 160(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 152(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 144(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 152(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 160(%TP) # Store CY in tmp[j+1] - - cmpq $20, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq 72(%TP), %rdx - movq %rax, 64(%ZP) - movq %rdx, 72(%ZP) - movq 80(%TP), %rax - movq 88(%TP), %rdx - movq %rax, 80(%ZP) - movq %rdx, 88(%ZP) - movq 96(%TP), %rax - movq 104(%TP), %rdx - movq %rax, 96(%ZP) - movq %rdx, 104(%ZP) - movq 112(%TP), %rax - movq 
120(%TP), %rdx - movq %rax, 112(%ZP) - movq %rdx, 120(%ZP) - movq 128(%TP), %rax - movq 136(%TP), %rdx - movq %rax, 128(%ZP) - movq %rdx, 136(%ZP) - movq 144(%TP), %rax - movq 152(%TP), %rdx - movq %rax, 144(%ZP) - movq %rdx, 152(%ZP) - - movl %CYl, %eax # use carry as return value - addq $168, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc2.asm gmp-ecm-7.0.5+ds/x86_64/mulredc2.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc2.asm 2016-08-25 13:42:08.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc2.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,258 +0,0 @@ -# mp_limb_t mulredc2(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. 
- - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc2 - TYPE(GSYM_PREFIX`'mulredc`'2,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc2: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $24, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1. Don't fetch new data from y[j+1]. - - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 0(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 8(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 16(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1. Don't fetch new data from y[j+1]. 
- - movq 16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 0(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 8(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 16(%TP) # Store CY in tmp[j+1] - - cmpq $2, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - - movl %CYl, %eax # use carry as return value - addq $24, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc3.asm gmp-ecm-7.0.5+ds/x86_64/mulredc3.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc3.asm 2016-08-25 13:42:09.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc3.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,333 +0,0 @@ -# mp_limb_t mulredc3(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. 
-# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. - - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc3 - TYPE(GSYM_PREFIX`'mulredc`'3,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. 
Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc3: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $32, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2. Don't fetch new data from y[j+1]. - - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 8(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 16(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 24(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2. Don't fetch new data from y[j+1]. 
- - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 8(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 16(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 24(%TP) # Store CY in tmp[j+1] - - cmpq $3, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq %rax, 16(%ZP) - - movl %CYl, %eax # use carry as return value - addq $32, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc4.asm gmp-ecm-7.0.5+ds/x86_64/mulredc4.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc4.asm 2016-08-25 13:42:09.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc4.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,408 +0,0 @@ -# mp_limb_t mulredc4(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. 
-# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. - - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc4 - TYPE(GSYM_PREFIX`'mulredc`'4,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. 
Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc4: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $40, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 16(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 24(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 32(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3. Don't fetch new data from y[j+1]. 
- - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 16(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 24(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 32(%TP) # Store CY in tmp[j+1] - - cmpq $4, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - - movl %CYl, %eax # use carry as return value - addq $40, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc5.asm gmp-ecm-7.0.5+ds/x86_64/mulredc5.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc5.asm 2016-08-25 13:42:09.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc5.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,483 +0,0 @@ -# mp_limb_t mulredc5(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. 
by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. - - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc5 - TYPE(GSYM_PREFIX`'mulredc`'5,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc5: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $48, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = 
u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 24(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 32(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 40(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add 
low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4. Don't fetch new data from y[j+1]. - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 24(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 32(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 40(%TP) # Store CY in tmp[j+1] - - cmpq $5, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq %rax, 32(%ZP) - - movl %CYl, %eax # use carry as return value - addq $48, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc6.asm gmp-ecm-7.0.5+ds/x86_64/mulredc6.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc6.asm 2016-08-25 13:42:09.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc6.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,558 +0,0 @@ -# mp_limb_t mulredc6(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, 
y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. - - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc6 - TYPE(GSYM_PREFIX`'mulredc`'6,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc6: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $56, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = 
u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 32(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 40(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 48(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add 
low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5. Don't fetch new data from y[j+1]. 
- - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 32(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 40(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 48(%TP) # Store CY in tmp[j+1] - - cmpq $6, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - - movl %CYl, %eax # use carry as return value - addq $56, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc7.asm gmp-ecm-7.0.5+ds/x86_64/mulredc7.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc7.asm 2016-08-25 13:42:09.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc7.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,633 +0,0 @@ -# mp_limb_t mulredc7(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. 
by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. - - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc7 - TYPE(GSYM_PREFIX`'mulredc`'7,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc7: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $64, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = 
u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 40(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 48(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 56(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add 
low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 
40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6. Don't fetch new data from y[j+1]. - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 40(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 48(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 56(%TP) # Store CY in tmp[j+1] - - cmpq $7, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq %rax, 48(%ZP) - - movl %CYl, %eax # use carry as return value - addq $64, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc8.asm gmp-ecm-7.0.5+ds/x86_64/mulredc8.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc8.asm 2016-08-25 13:42:09.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc8.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,708 +0,0 @@ -# mp_limb_t mulredc8(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# 
Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. - - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc8 - TYPE(GSYM_PREFIX`'mulredc`'8,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc8: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $72, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = 
u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 48(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 56(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 64(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add 
low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 
40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7. Don't fetch new data from y[j+1]. 
- - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 48(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 56(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 64(%TP) # Store CY in tmp[j+1] - - cmpq $8, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - - movl %CYl, %eax # use carry as return value - addq $72, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc9.asm gmp-ecm-7.0.5+ds/x86_64/mulredc9.asm --- gmp-ecm-7.0.4+ds/x86_64/mulredc9.asm 2016-08-25 13:42:10.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc9.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,783 +0,0 @@ -# mp_limb_t mulredc9(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, -# const mp_limb_t *m, mp_limb_t inv_m); -# -# Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 -# Needs %rbx, %rsp, %rbp, %r12-%r15 restored -# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) -# Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored - -# This stuff is run through M4 twice, first when generating the -# mulredc*.asm files from the mulredc.m4 file (when preparing the distro) -# and again when generating the mulredc*.s files from the mulredc*.asm files -# when the user compiles the program. -# We used to substitute XP etc. 
by register names in the first pass, -# but now with switching between Linux and Windows ABI, we do it in -# the second pass instead when we know which ABI we have, as that -# allows us to assign registers differently for the two ABIs. -# That means that the defines for XP etc., need to be quoted once to be -# protected in the first M4 pass, so that they are processed and -# occurrences of XP etc. happen only in the second pass. - - - -include(`config.m4') - - TEXT -.p2align 6 # x86_64 L1 code cache line is 64 bytes long - GLOBL GSYM_PREFIX`'mulredc9 - TYPE(GSYM_PREFIX`'mulredc`'9,`function') - -# Implements multiplication and REDC for two input numbers of LENGTH words -ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') -# tmp[0 ... len+1] = 0 -# for (i = 0; i < len; i++) -# { -# t = x[i] * y[0]; /* Keep and reuse this product */ -# u = ((t + tmp[0]) * invm) % 2^64 -# tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ -# for (j = 1; j < len; j++) -# { -# tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); -# /* put new carry in cy */ -# } -# tmp[len] = cy; -# } -# z[0 ... len-1] = tmp[0 ... len-1] -# return (tmp[len]) - - -# Values that are referenced only once in the loop over j go into r8 .. r14, -# In the inner loop (over j), tmp, x[i], y, m, and u are constant. -# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values -# stay in registers and are referenced as -# TP = tmp, YP = y, MP = m, -# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry - -define(`T0', `rsi')dnl -define(`T0l', `esi')dnl -define(`T1', `rbx')dnl -define(`T1l', `ebx')dnl -define(`CY', `rcx')dnl -define(`CYl', `ecx')dnl -define(`CYb', `cl')dnl -define(`XI', `r14')dnl # register that holds x[i] value -define(`U', `r11')dnl -define(`XP', `r13')dnl # register that points to the x arraz -define(`TP', `rbp')dnl # register that points to t + i -define(`I', `r12')dnl # register that holds loop counter i -define(`Il', `r12d')dnl # register that holds loop counter i -define(`ZP', `rdi')dnl # register that holds z. Same as passed in -ifdef(`WINDOWS64_ABI', -`define(`YP', `r8')dnl # points to y array, same as passed in -define(`MP', `r9')dnl # points to m array, same as passed in -define(`INVM', `r10')dnl # register that holds invm. Same as passed in' -, -`define(`YP', `r9')dnl # register that points to the y array -define(`MP', `r10')dnl # register that points to the m array -define(`INVM', `r8')dnl # register that holds invm. Same as passed in' -)dnl - -`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U -`#' `YP' = YP, `MP' = MP, `TP' = TP - -# local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words -# The tmp array needs LENGTH+1 entries, the last one is so that we can -# store CY at tmp[j+1] for j == len-1 - - - -GSYM_PREFIX`'mulredc9: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 -ifdef(`WINDOWS64_ABI', -` pushq %rsi - pushq %rdi -') dnl -ifdef(`WINDOWS64_ABI', -` movq %rdx, %XP - movq %rcx, %ZP - movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' -, -` movq %rsi, %XP # store x in XP - movq %rdx, %YP # store y in YP - movq %rcx, %MP # store m in MP' -) dnl - subq $80, %rsp # subtract size of local vars - - -######################################################################### -# i = 0 pass -######################################################################### - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be != 0) - -# Pass for j = 0. We need to fetch x[i] from memory and compute the new u - - movq (%XP), %XI # XI = x[0] - movq (%YP), %rax # rax = y[0] - - xorl %CYl, %CYl # set %CY to 0 - lea (%rsp), %TP # store addr of tmp array in TP - movl %CYl, %Il # Set %I to 0 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - movq %rax, %T0 # Move low word of product to T0 - movq %rdx, %T1 # Move high word of product to T1 - -ifdef(`MULREDC_SVOBODA', -, `' -` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' -) movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - movq 8(%YP), %rax # Fetch y[1] - adcq %rdx, %T1 # - setc %CYb - # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence - # CY:T1 <= 2*2^64 - 4 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = 
u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 8(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 16(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 24(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 32(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 40(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 48(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
- - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined -`#' %CY = carry into T1 (is <= 2) -# We have %CY:%T1 <= 2 * 2^64 - 2 - - movl %CYl, %T1l # T1 = CY <= 1 - - # Here, T1:T0 <= 2*2^64 - 2 - mulq %XI # y[j] * x[i] - # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 - addq %rax, %T0 # Add low word to T0 - movq 56(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! - - mulq %U # m[j]*u - # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 - addq %T0, %rax # Add T0 and low word - movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= - # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8. Don't fetch new data from y[j+1]. 
- - movl %CYl, %T1l # T1 = CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 56(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 64(%TP) # Store T1 in tmp[j] - setc %CYb # %CY <= 1 - movq %CY, 72(%TP) # Store CY in tmp[j+1] - -######################################################################### -# i > 0 passes -######################################################################### - -.p2align 5,,4 -LABEL_SUFFIX(1) - -# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m -# %CY < 255 (i.e. only low byte may be > 0) - -# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory -# and compute the new u - - movq (%XP,%I,8), %XI # XI = x[i] - movq (%YP), %rax # rax = y[0] -#init the register tmp ring buffer - movq (%TP), %T0 # Load tmp[0] into T0 - movq 8(%TP), %T1 # Load tmp[1] into T1 - - mulq %XI # rdx:rax = y[0] * x[i] - addq $1, %I - - addq %T0, %rax # Add T0 to low word - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - - movq %rax, %T0 # Save sum of low words in T0 - imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 - movq %rax, %U # this is the new u value - - mulq (%MP) # multipy u*m[0] - addq %rax, %T0 # Now %T0 = 0, need not be stored - adcq %rdx, %T1 # - - movq 8(%YP), %rax # Fetch y[1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 1 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 
16(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 8(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 2 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 24(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 16(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 3 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 32(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add 
low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 24(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 4 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 40(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 32(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 5 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 48(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 
40(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 6 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 56(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 48(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 7 -`#' Register values at entry: -`#' %rax = y[j], %XI = x[i], %U = u -`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in -`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 - - movq 64(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - setc %CYb # %CY <= 1 - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - - movq %U, %rax - adcq %rdx, %T1 # Add high word with carry to T1 - adcb $0, %CYb # %CY <= 2 - - mulq 56(%MP) # m[j]*u - addq %rax, %T0 # Add T0 and low word - - movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax - adcq 
%rdx, %T1 # Add high word with carry to T1 - movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] - -define(`TT', defn(`T0'))dnl -define(`TTl', defn(`T0l'))dnl -define(`T0', defn(`T1'))dnl -define(`T0l', defn(`T1l'))dnl -define(`T1', defn(`TT'))dnl -define(`T1l', defn(`TTl'))dnl -undefine(`TT')dnl -undefine(`TTl')dnl -`#' Now `T0' = T0, `T1' = T1 - - -`#' Pass for j = 8. Don't fetch new data from y[j+1]. - - movq 72(%TP), %T1 - adcq %CY, %T1 # T1 = CY + tmp[j+1] - - mulq %XI # y[j] * x[i] - addq %rax, %T0 # Add low word to T0 - movq 64(%MP), %rax # Fetch m[j] into %rax - adcq %rdx, %T1 # Add high word with carry to T1 - setc %CYb # %CY <= 1 - mulq %U # m[j]*u - addq %rax, %T0 # Add low word to T0 - movq %T0, 56(%TP) # Store T0 in tmp[j-1] - adcq %rdx, %T1 # Add high word with carry to T1 - movq %T1, 64(%TP) # Store T1 in tmp[j] - adcb $0, %CYb # %CY <= 2 - movq %CY, 72(%TP) # Store CY in tmp[j+1] - - cmpq $9, %I - jb 1b - -# Copy result from tmp memory to z - movq (%TP), %rax - movq 8(%TP), %rdx - movq %rax, (%ZP) - movq %rdx, 8(%ZP) - movq 16(%TP), %rax - movq 24(%TP), %rdx - movq %rax, 16(%ZP) - movq %rdx, 24(%ZP) - movq 32(%TP), %rax - movq 40(%TP), %rdx - movq %rax, 32(%ZP) - movq %rdx, 40(%ZP) - movq 48(%TP), %rax - movq 56(%TP), %rdx - movq %rax, 48(%ZP) - movq %rdx, 56(%ZP) - movq 64(%TP), %rax - movq %rax, 64(%ZP) - - movl %CYl, %eax # use carry as return value - addq $80, %rsp -ifdef(`WINDOWS64_ABI', -` popq %rdi - popq %rsi -') dnl - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc.m4 gmp-ecm-7.0.5+ds/x86_64/mulredc.m4 --- gmp-ecm-7.0.4+ds/x86_64/mulredc.m4 2016-08-25 13:42:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/x86_64/mulredc.m4 2022-06-06 14:16:49.000000000 +0000 @@ -374,3 +374,11 @@ popq %rbp popq %rbx ret + +`ifdef(`WINDOWS64_ABI', +, +` +`#'if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +`#'endif +') dnl' diff -Nru gmp-ecm-7.0.4+ds/x86_64/mulredc.s 
gmp-ecm-7.0.5+ds/x86_64/mulredc.s
--- gmp-ecm-7.0.4+ds/x86_64/mulredc.s 1970-01-01 00:00:00.000000000 +0000
+++ gmp-ecm-7.0.5+ds/x86_64/mulredc.s 2022-06-06 14:16:49.000000000 +0000
@@ -0,0 +1,165 @@
+#
+# mp_limb_t mulredc(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
+#                   const mp_limb_t *m, mp_size_t n, mp_limb_t inv_m)
+#
+# Compute z := x*y mod m, in Montgomery representation, where x, y < m
+# and m is n limbs wide. inv_m is the least significant limb of the
+# inverse of m modulo 2^(n*GMP_LIMB_BITS)
+#
+# The result might be unreduced (larger than m) but becomes reduced
+# after subtracting m. The calling function should take care of that.
+#
+# We use a temporary space for the unreduced product on the stack.
+# Therefore, this cannot be used for large integers (anyway, the
+# algorithm is quadratic).
+#
+# WARNING: z is only n limbs but since it might be unreduced, there
+# could be a carry that does not fit in z. This carry is returned.
+
+
+        .text
+        .globl mulredc
+        .type mulredc,@function
+mulredc:
+        pushq %rbx
+        pushq %r12
+        pushq %r13
+        pushq %r14
+        pushq %r15              # push registers of caller
+        movq %r8, %rax          # rax <- n
+        shlq $4, %rax           # rax <- 16*n
+        addq $8, %rax           # rax <- 16*n + 8 bytes, i.e. 2n+1 limbs of 8 bytes
+        subq %rax, %rsp         # allocate TMP on stack
+        movq %rsp, %r10         # r10 <- TMP
+        pushq %r10              # pointer to TMP
+        pushq %rdi              # push parameters (z first)
+        pushq %rsi              # x
+        pushq %rdx              # y
+        pushq %rcx              # m
+
+# Stack:                               Other parameters:
+#     (%rsp) : m                           %r8 : n
+#    8(%rsp) : y                           %r9 : inv_m
+#   16(%rsp) : x
+#   24(%rsp) : z
+#   32(%rsp) : TMP (pointer to)
+#   40+%rsp  : TMP (first limb)
+
+### Init: put 0 in TMP[0..2n]
+#############################
+        movq %r8, %rcx          # rcx <- n
+        shlq $1, %rcx           # rcx <- 2n
+        incq %rcx               # rcx <- 2n+1 = number of limbs to clear
+        movq 32(%rsp), %rdi     # rdi <- TMP
+InitLoop:
+        movq $0, (%rdi)         # clear the current limb of TMP
+        addq $8, %rdi           # step to the next limb
+        decq %rcx               # one limb fewer to clear
+        jnz InitLoop
+
+### Outer:
+##############################
+### for (i = 0; i < n; ++i) {
+###   u = (TMP[0]+x[i]*y[0])*inv_m mod 2^GMP_LIMB_BITS;
+###   TMP[0..n+1] += x[i]*y + u*m; // this is "Inner"
+###   TMP++;
+### }
+##############################
+        movq %r8, %r15          # r15 <- n, outer loop counter (counts down)
+OuterLoop:
+        ## compute u
+        movq 16(%rsp), %rbx     # rbx <- current x pointer (advanced once per outer pass)
+        movq (%rbx), %rax       # rax <- x[i]
+        movq 8(%rsp), %rbx      # rbx <- y
+        mulq (%rbx)             # rdx:rax <- x[i]*y[0]
+        movq 32(%rsp), %rbx     # rbx <- current TMP pointer
+        addq (%rbx), %rax       # rax <- low limb of TMP[0] + x[i]*y[0]
+        mulq %r9                # rax <- low limb of rax*inv_m (high limb in rdx is not used)
+        movq %rax, %r10         # %r10 gets the multiplier u
+
+        ### Inner:
+        ######################
+        ### TMP[0..n+1] += x[i]*y + u*m
+        ###
+        ### ax: ...                r8 : n     (for outer)
+        ### bx: m                  r9 : inv_m (for outer)
+        ### cx: inner cpt          r10: u
+        ### dx: ...                r11: x[i]
+        ### di: TMP                r12, r13, r14: ...
+        ### si: y                  r15: outer cpt
+        ######################
+        movq (%rsp), %rbx       # rbx <- m
+        movq 32(%rsp), %rdi     # rdi <- current TMP pointer
+        movq 8(%rsp), %rsi      # rsi <- y
+        movq 16(%rsp), %rdx     # rdx <- current x pointer
+        movq (%rdx), %r11       # r11 <- x[i]
+        movq %r8, %rcx          # rcx <- n, inner loop counter (counts down)
+        xorq %r12, %r12         # carry lo <- 0
+        xorq %r13, %r13         # carry hi <- 0
+ InnerLoop:     ### r12: carry lo, r13: carry hi (can be at most 2)
+        movq (%rsi), %rax       # rax <- y[j]
+        mulq %r11               # rdx:rax <- y[j]*x[i]
+        addq %rax, %r12         # r12 <- carry lo + low limb of y[j]*x[i]
+        adcq $0, %rdx           # fold the carry of that add into the high limb
+        addq %r12, (%rdi)       # TMP[j] += r12
+        adcq $0, %rdx           ## carry flag is clean
+        movq %rdx, %r14         # r14 <- high limb of the x[i]*y[j] contribution, kept across the next mulq
+        movq (%rbx), %rax       # rax <- m[j]
+        mulq %r10               # rdx:rax <- m[j]*u
+        movq %r13, %r12         ## carry hi becomes carry low
+        xorq %r13, %r13         # carry hi <- 0 for the next limb
+        addq %rax, (%rdi)       # TMP[j] += low limb of m[j]*u
+        adcq %rdx, %r12         # carry lo += high limb of m[j]*u + carry flag
+        adcq $0, %r13           # overflow of carry lo goes to carry hi
+        addq %r14, %r12         # carry lo += saved high limb of the x[i]*y[j] contribution
+        adcq $0, %r13           # overflow of carry lo goes to carry hi
+
+        addq $8, %rdi           # next limb of TMP
+        addq $8, %rsi           # next limb of y
+        addq $8, %rbx           # next limb of m
+        decq %rcx               # one limb fewer to process
+        jnz InnerLoop
+        addq %r12, (%rdi)       # add carry lo into the limb after the n processed ones
+        adcq %r13, 8(%rdi)      # add carry hi (plus carry flag) into the following limb
+        ######################
+
+        ## advance TMP and x
+        movq 32(%rsp), %rdx     # rdx <- saved TMP pointer
+        addq $8, %rdx           # move it one limb forward
+        movq %rdx, 32(%rsp)     # store it back
+        movq 16(%rsp), %rdx     # rdx <- saved x pointer
+        addq $8, %rdx           # move it one limb forward
+        movq %rdx, 16(%rsp)     # store it back
+        decq %r15               # one outer pass fewer to run
+        jnz OuterLoop
+
+### Finish:
+##############################
+### Copy TMP into z
+##############################
+        movq %r8, %rcx          # rcx <- n limbs to copy
+        movq 32(%rsp), %rsi     # rsi <- TMP pointer, already advanced by n limbs in OuterLoop
+        movq 24(%rsp), %rdi     # rdi <- z
+FinishLoop:
+        movq (%rsi), %rax       # load one limb of TMP
+        movq %rax, (%rdi)       # store it into z
+        addq $8, %rdi           # next limb of z
+        addq $8, %rsi           # next limb of TMP
+        decq %rcx               # one limb fewer to copy
+        jnz FinishLoop
+
+
+        addq $40, %rsp          # clean parameters
+        movq %r8, %rax          # rax <- n
+        shlq $4, %rax           # rax <- 16*n
+        addq $8, %rax           # rax <- 16*n + 8, same size as allocated on entry
+        addq %rax, %rsp         # free TMP on stack
+
+        popq %r15               # restore caller registers in reverse push order
+        popq %r14
+        popq %r13
+        popq %r12
+        popq %rbx
+
+        movq (%rsi), %rax       ## returned value: limb after the n copied ones. NOTE(review): read after TMP was released above -- confirm intended
+        ret
diff -Nru gmp-ecm-7.0.4+ds/x86_64/toto gmp-ecm-7.0.5+ds/x86_64/toto
--- gmp-ecm-7.0.4+ds/x86_64/toto 1970-01-01 00:00:00.000000000 +0000
+++
gmp-ecm-7.0.5+ds/x86_64/toto 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,75 @@ +************************ +mulredc1 + +x, y, z, m : taille 1 + +tmp: taille 2 + +u = x*y*invm +tmp[0..2[ = x*y + u*m -> carry cy retournée +z = tmp[1] + +************************ +mulredc2 + +u0 = x[0]*y[0]*invm +tmp[0..3[ = x[0]*y +tmp[0..3[ += u0*m -> carry in tmp[3] + +u1 = x[1]*y[0]*invm + tmp[1] +tmp[1..4[ += x[1]*y -> carry??? +tmp[1..4[ += u1*m -> carry cy retournée +z[0..1[ = tmp[0..1[ + + +######################################################### + +z0:z1:z2:.... +x0:x1:x2:.... +y0:y1:y2:.... + +z +:= u*x + v*y + +Pour x: pour y: + cy.hi dy.lo + cy.lo dy.hi + + z[i] z[i+1] z[i+2] z[i+3] + dy.lo dy.hi dy.HI + + y[i+3]*v + rax rdx + + cy.lo cy.hi cy.HI + + x[i+1]*u + rax rdx + +MOV x[i+1] -> rax +MUL u -> rax:rdx +ADD cy.lo -> z[i] +ADC rax -> cy.hi +MOV rdx -> cy.HI + +### ADC 0 -> cy.HI + +MOV y[i+3] -> rax +MUL v -> rax:rdx +ADC dy.lo -> z[i+2] +ADC rax -> dy.hi +MOV rdx -> dy.HI +ADC 0 -> dy.HI + + +Etat initial: + rsp: &z[0] + rsi: &x[0] r8: u + r10: &y[0] r9: v + +registres libres: rax, rdx + rbx: cy.lo + rcx: cy.hi + r11: dy.lo + r12: dy.hi + + diff -Nru gmp-ecm-7.0.4+ds/Z2102.n gmp-ecm-7.0.5+ds/Z2102.n --- gmp-ecm-7.0.4+ds/Z2102.n 1970-01-01 00:00:00.000000000 +0000 +++ gmp-ecm-7.0.5+ds/Z2102.n 2022-06-06 14:16:49.000000000 +0000 @@ -0,0 +1,3 @@ +# The following number = 71^1135-48, small p14 factor can be found with: ecm -sigma 0:4702936202311981741 2000 
+1507390577235554766119296364491077521556373780881314756730820510000307364912653937088623277193146153850920779863522230862047673339907015809081480476840897381434413957747752832235330273685867964792644904558595144903032802353761079475555024404670105395175378671345284001093429868451903560899446632740704564164416826157607262218232815123878159664156007299840868463013704023397692507732144756184199009851426209403226908294292206185729934129022588508107586532701937600204285906656822673121956288978001362460283102016848786765730720474078005664023555722729179007448096217888672230470487150534676175234630191369704649555387763430062891389644754843745611256135245678320782527434406168236464113802051234779723788690433412968539121866735858236804973418816918591844287440560241959747903108765371944628064484920128553826786810855808965530833544031870441563575698088992464592181776094154130584371944018572640582222072497598921498270281756472706356951436367192422642713390247810295273310012436199593831307411966479035894447580527852471622761951315931363663578172842382759325916169448981528264516229294789034232886433441177010510293512863028894470581795006707202490106273138155958812921699559048469122607322827351352804281948114538421588824944757485763456319632797016307721450937319197956822789575239841604318954075013001544725281529510492007905352349158566998406661609483500530716967687569540265966699624586212599125327612886404401306392932817787460263554178316201012827911686486439412625489670919655785529054701351532475446295855172390371808383298311653802324761121473694767016335694946614125052234015817069133081828102935354072889877039152624698824647085289413960030705711748537367451315875397290223431226275290688965012629733566949935429143297907885315165859471051588881174646804334668375513074700025705565546170612051166202411220788259220683451564153333308646079311387007433486101942998496199773881660312853780596785470697813601443283208896790358444578300932306247841441988445775822339705240445987843513305729
0510961559040168908138290806586866583614076715728159306559319446232567383457974490947696901232382984903 +