#pragma once /** @file @brief generic function for each N @author MITSUNARI Shigeo(@herumi) @license modified new BSD license http://opensource.org/licenses/BSD-3-Clause */ #include #include #include #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable : 4127) #endif namespace mcl { namespace fp { struct Gtag; // GMP struct Ltag; // LLVM struct LBMI2tag; // LLVM with Intel BMI2 instruction struct Atag; // asm template struct TagToStr { }; template<> struct TagToStr { static const char *f() { return "Gtag"; } }; template<> struct TagToStr { static const char *f() { return "Ltag"; } }; template<> struct TagToStr { static const char *f() { return "LBMI2tag"; } }; template<> struct TagToStr { static const char *f() { return "Atag"; } }; template void clearC(Unit *x) { clearArray(x, 0, N); } template bool isZeroC(const Unit *x) { return isZeroArray(x, N); } template void copyC(Unit *y, const Unit *x) { copyArray(y, x, N); } // (carry, z[N]) <- x[N] + y[N] template struct AddPre { static inline Unit func(Unit *z, const Unit *x, const Unit *y) { #ifdef MCL_USE_VINT return mcl::vint::addN(z, x, y, N); #else return mpn_add_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N); #endif } static const u3u f; }; template const u3u AddPre::f = AddPre::func; // (carry, x[N]) <- x[N] + y template struct AddUnitPre { static inline Unit func(Unit *x, Unit n, Unit y) { #if 1 int ret = 0; Unit t = x[0] + y; x[0] = t; if (t >= y) goto EXIT_0; for (size_t i = 1; i < n; i++) { t = x[i] + 1; x[i] = t; if (t != 0) goto EXIT_0; } ret = 1; EXIT_0: return ret; #else return mpn_add_1((mp_limb_t*)x, (const mp_limb_t*)x, (int)n, y); #endif } static const u1uII f; }; template const u1uII AddUnitPre::f = AddUnitPre::func; // (carry, z[N]) <- x[N] - y[N] template struct SubPre { static inline Unit func(Unit *z, const Unit *x, const Unit *y) { #ifdef MCL_USE_VINT return mcl::vint::subN(z, x, y, N); #else return mpn_sub_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N); #endif } static const u3u f; }; template const u3u SubPre::f = SubPre::func; // y[N] <- (x[N] >> 1) template struct Shr1 { static inline void func(Unit *y, const Unit *x) { #ifdef MCL_USE_VINT mcl::vint::shrN(y, x, N, 1); #else mpn_rshift((mp_limb_t*)y, (const mp_limb_t*)x, (int)N, 1); #endif } static const void2u f; }; template const void2u Shr1::f = Shr1::func; // y[N] <- (-x[N]) % p[N] template struct Neg { static inline void func(Unit *y, const Unit *x, const Unit *p) { if (isZeroC(x)) { if (x != y) clearC(y); return; } SubPre::f(y, p, x); } static const void3u f; }; template const void3u Neg::f = Neg::func; // z[N * 2] <- x[N] * y[N] template struct MulPreCore { static inline void func(Unit *z, const Unit *x, const Unit *y) { #ifdef MCL_USE_VINT mcl::vint::mulNM(z, x, N, y, N); #else mpn_mul_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, (int)N); #endif } static const void3u f; }; template const void3u MulPreCore::f = MulPreCore::func; template struct EnableKaratsuba { /* always use mpn* for Gtag */ static const size_t minMulN = 100; static const size_t minSqrN = 100; }; template struct MulPre { /* W = 1 << H x = aW + b, y = cW + d xy = acW^2 + (ad + bc)W + bd ad + bc = (a + b)(c + d) - ac - bd */ static inline void karatsuba(Unit *z, const Unit *x, const Unit *y) { const size_t H = N / 2; MulPre::f(z, x, y); // bd MulPre::f(z + N, x + H, y + H); // ac Unit a_b[H]; Unit c_d[H]; Unit c1 = AddPre::f(a_b, x, x + H); // a + b Unit c2 = AddPre::f(c_d, y, y + H); // c + d Unit tmp[N]; MulPre::f(tmp, a_b, c_d); Unit c = c1 & c2; if (c1) { c += AddPre::f(tmp + H, tmp + H, c_d); } if (c2) { c += AddPre::f(tmp + H, tmp + H, a_b); } // c:tmp[N] = (a + b)(c + d) c -= SubPre::f(tmp, tmp, z); c -= SubPre::f(tmp, tmp, z + N); // c:tmp[N] = ad + bc c += AddPre::f(z + H, z + H, tmp); assert(c <= 2); if (c) { AddUnitPre::f(z + N + H, H, c); } } static inline void func(Unit *z, const Unit *x, const Unit *y) { #if 1 if (N >= EnableKaratsuba::minMulN && (N % 2) == 0) { karatsuba(z, x, y); return; } #endif MulPreCore::f(z, x, y); } static const void3u f; }; template const void3u MulPre::f = MulPre::func; template struct MulPre<0, Tag> { static inline void f(Unit*, const Unit*, const Unit*) {} }; template struct MulPre<1, Tag> { static inline void f(Unit* z, const Unit* x, const Unit* y) { MulPreCore<1, Tag>::f(z, x, y); } }; // z[N * 2] <- x[N] * x[N] template struct SqrPreCore { static inline void func(Unit *y, const Unit *x) { #ifdef MCL_USE_VINT mcl::vint::sqrN(y, x, N); #else mpn_sqr((mp_limb_t*)y, (const mp_limb_t*)x, N); #endif } static const void2u f; }; template const void2u SqrPreCore::f = SqrPreCore::func; template struct SqrPre { /* W = 1 << H x = aW + b x^2 = aaW^2 + 2abW + bb */ static inline void karatsuba(Unit *z, const Unit *x) { const size_t H = N / 2; SqrPre::f(z, x); // b^2 SqrPre::f(z + N, x + H); // a^2 Unit ab[N]; MulPre::f(ab, x, x + H); // ab Unit c = AddPre::f(ab, ab, ab); c += AddPre::f(z + H, z + H, ab); if (c) { AddUnitPre::f(z + N + H, H, c); } } static inline void func(Unit *y, const Unit *x) { #if 1 if (N >= EnableKaratsuba::minSqrN && (N % 2) == 0) { karatsuba(y, x); return; } #endif SqrPreCore::f(y, x); } static const void2u f; }; template const void2u SqrPre::f = SqrPre::func; template struct SqrPre<0, Tag> { static inline void f(Unit*, const Unit*) {} }; template struct SqrPre<1, Tag> { static inline void f(Unit* y, const Unit* x) { SqrPreCore<1, Tag>::f(y, x); } }; // z[N + 1] <- x[N] * y template struct MulUnitPre { static inline void func(Unit *z, const Unit *x, Unit y) { #ifdef MCL_USE_VINT z[N] = mcl::vint::mulu1(z, x, N, y); #else z[N] = mpn_mul_1((mp_limb_t*)z, (const mp_limb_t*)x, N, y); #endif } static const void2uI f; }; template const void2uI MulUnitPre::f = MulUnitPre::func; // z[N] <- x[N + 1] % p[N] template struct N1_Mod { static inline void func(Unit *y, const Unit *x, const Unit *p) { #ifdef MCL_USE_VINT mcl::vint::divNM(0, 0, y, x, N + 1, p, N); #else mp_limb_t q[2]; // not used mpn_tdiv_qr(q, (mp_limb_t*)y, 0, (const mp_limb_t*)x, N + 1, (const mp_limb_t*)p, N); #endif } static const void3u f; }; template const void3u N1_Mod::f = N1_Mod::func; // z[N] <- (x[N] * y) % p[N] template struct MulUnit { static inline void func(Unit *z, const Unit *x, Unit y, const Unit *p) { Unit xy[N + 1]; MulUnitPre::f(xy, x, y); #if 1 Unit len = UnitBitSize - 1 - cybozu::bsr(p[N - 1]); Unit v = xy[N]; if (N > 1 && len < 3 && v < 0xff) { for (;;) { if (len == 0) { v = xy[N]; } else { v = (xy[N] << len) | (xy[N - 1] >> (UnitBitSize - len)); } if (v == 0) break; if (v == 1) { xy[N] -= SubPre::f(xy, xy, p); } else { Unit t[N + 1]; MulUnitPre::f(t, p, v); SubPre::f(xy, xy, t); } } for (;;) { if (SubPre::f(z, xy, p)) { copyC(z, xy); return; } if (SubPre::f(xy, z, p)) { return; } } } #endif N1_Mod::f(z, xy, p); } static const void2uIu f; }; template const void2uIu MulUnit::f = MulUnit::func; // z[N] <- x[N * 2] % p[N] template struct Dbl_Mod { static inline void func(Unit *y, const Unit *x, const Unit *p) { #ifdef MCL_USE_VINT mcl::vint::divNM(0, 0, y, x, N * 2, p, N); #else mp_limb_t q[N + 1]; // not used mpn_tdiv_qr(q, (mp_limb_t*)y, 0, (const mp_limb_t*)x, N * 2, (const mp_limb_t*)p, N); #endif } static const void3u f; }; template const void3u Dbl_Mod::f = Dbl_Mod::func; template struct SubIfPossible { static inline void f(Unit *z, const Unit *p) { Unit tmp[N - 1]; if (SubPre::f(tmp, z, p) == 0) { copyC(z, tmp); z[N - 1] = 0; } } }; template struct SubIfPossible<1, Tag> { static inline void f(Unit *, const Unit *) { } }; // z[N] <- (x[N] + y[N]) % p[N] template struct Add { static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p) { if (isFullBit) { if (AddPre::f(z, x, y)) { SubPre::f(z, z, p); return; } Unit tmp[N]; if (SubPre::f(tmp, z, p) == 0) { copyC(z, tmp); } } else { AddPre::f(z, x, y); Unit a = z[N - 1]; Unit b = p[N - 1]; if (a < b) return; if (a > b) { SubPre::f(z, z, p); return; } /* the top of z and p are same */ SubIfPossible::f(z, p); } } static const void4u f; }; template const void4u Add::f = Add::func; // z[N] <- (x[N] - y[N]) % p[N] template struct Sub { static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p) { if (SubPre::f(z, x, y)) { AddPre::f(z, z, p); } } static const void4u f; }; template const void4u Sub::f = Sub::func; // z[N * 2] <- (x[N * 2] + y[N * 2]) mod p[N] << (N * UnitBitSize) template struct DblAdd { static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p) { if (AddPre::f(z, x, y)) { SubPre::f(z + N, z + N, p); return; } Unit tmp[N]; if (SubPre::f(tmp, z + N, p) == 0) { memcpy(z + N, tmp, sizeof(tmp)); } } static const void4u f; }; template const void4u DblAdd::f = DblAdd::func; // z[N * 2] <- (x[N * 2] - y[N * 2]) mod p[N] << (N * UnitBitSize) template struct DblSub { static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p) { if (SubPre::f(z, x, y)) { AddPre::f(z + N, z + N, p); } } static const void4u f; }; template const void4u DblSub::f = DblSub::func; /* z[N] <- montRed(xy[N * 2], p[N]) REMARK : assume p[-1] = rp */ template struct MontRed { static inline void func(Unit *z, const Unit *xy, const Unit *p) { const Unit rp = p[-1]; Unit pq[N + 1]; Unit buf[N * 2 + 1]; copyC(buf + N + 1, xy + N + 1); buf[N * 2] = 0; Unit q = xy[0] * rp; MulUnitPre::f(pq, p, q); Unit up = AddPre::f(buf, xy, pq); if (up) { buf[N * 2] = AddUnitPre::f(buf + N + 1, N - 1, 1); } Unit *c = buf + 1; for (size_t i = 1; i < N; i++) { q = c[0] * rp; MulUnitPre::f(pq, p, q); up = AddPre::f(c, c, pq); if (up) { AddUnitPre::f(c + N + 1, N - i, 1); } c++; } if (c[N]) { SubPre::f(z, c, p); } else { if (SubPre::f(z, c, p)) { memcpy(z, c, N * sizeof(Unit)); } } } static const void3u f; }; template const void3u MontRed::f = MontRed::func; /* z[N] <- Montgomery(x[N], y[N], p[N]) REMARK : assume p[-1] = rp */ template struct Mont { static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p) { #if MCL_MAX_BIT_SIZE == 1024 || MCL_SIZEOF_UNIT == 4 // check speed Unit xy[N * 2]; MulPre::f(xy, x, y); MontRed::f(z, xy, p); #else const Unit rp = p[-1]; if (isFullBit) { Unit buf[N * 2 + 2]; Unit *c = buf; MulUnitPre::f(c, x, y[0]); // x * y[0] Unit q = c[0] * rp; Unit t[N + 2]; MulUnitPre::f(t, p, q); // p * q t[N + 1] = 0; // always zero c[N + 1] = AddPre::f(c, c, t); c++; for (size_t i = 1; i < N; i++) { MulUnitPre::f(t, x, y[i]); c[N + 1] = AddPre::f(c, c, t); q = c[0] * rp; MulUnitPre::f(t, p, q); AddPre::f(c, c, t); c++; } if (c[N]) { SubPre::f(z, c, p); } else { if (SubPre::f(z, c, p)) { memcpy(z, c, N * sizeof(Unit)); } } } else { /* R = 1 << 64 L % 64 = 63 ; not full bit F = 1 << (L + 1) max p = (1 << L) - 1 x, y <= p - 1 max x * y[0], p * q <= ((1 << L) - 1)(R - 1) t = x * y[i] + p * q <= 2((1 << L) - 1)(R - 1) = (F - 2)(R - 1) t >> 64 <= (F - 2)(R - 1)/R = (F - 2) - (F - 2)/R t + (t >> 64) = (F - 2)R - (F - 2)/R < FR */ Unit carry; (void)carry; Unit buf[N * 2 + 1]; Unit *c = buf; MulUnitPre::f(c, x, y[0]); // x * y[0] Unit q = c[0] * rp; Unit t[N + 1]; MulUnitPre::f(t, p, q); // p * q carry = AddPre::f(c, c, t); assert(carry == 0); c++; c[N] = 0; for (size_t i = 1; i < N; i++) { c[N + 1] = 0; MulUnitPre::f(t, x, y[i]); carry = AddPre::f(c, c, t); assert(carry == 0); q = c[0] * rp; MulUnitPre::f(t, p, q); carry = AddPre::f(c, c, t); assert(carry == 0); c++; } assert(c[N] == 0); if (SubPre::f(z, c, p)) { memcpy(z, c, N * sizeof(Unit)); } } #endif } static const void4u f; }; template const void4u Mont::f = Mont::func; // z[N] <- Montgomery(x[N], x[N], p[N]) template struct SqrMont { static inline void func(Unit *y, const Unit *x, const Unit *p) { #if MCL_MAX_BIT_SIZE == 1024 || MCL_SIZEOF_UNIT == 4 // check speed Unit xx[N * 2]; SqrPre::f(xx, x); MontRed::f(y, xx, p); #else Mont::f(y, x, x, p); #endif } static const void3u f; }; template const void3u SqrMont::f = SqrMont::func; // z[N] <- (x[N] * y[N]) % p[N] template struct Mul { static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p) { Unit xy[N * 2]; MulPre::f(xy, x, y); Dbl_Mod::f(z, xy, p); } static const void4u f; }; template const void4u Mul::f = Mul::func; // y[N] <- (x[N] * x[N]) % p[N] template struct Sqr { static inline void func(Unit *y, const Unit *x, const Unit *p) { Unit xx[N * 2]; SqrPre::f(xx, x); Dbl_Mod::f(y, xx, p); } static const void3u f; }; template const void3u Sqr::f = Sqr::func; template struct Fp2MulNF { static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p) { const Unit *const a = x; const Unit *const b = x + N; const Unit *const c = y; const Unit *const d = y + N; Unit d0[N * 2]; Unit d1[N * 2]; Unit d2[N * 2]; Unit s[N]; Unit t[N]; AddPre::f(s, a, b); AddPre::f(t, c, d); MulPre::f(d0, s, t); MulPre::f(d1, a, c); MulPre::f(d2, b, d); SubPre::f(d0, d0, d1); SubPre::f(d0, d0, d2); MontRed::f(z + N, d0, p); DblSub::f(d1, d1, d2, p); MontRed::f(z, d1, p); } static const void4u f; }; template const void4u Fp2MulNF::f = Fp2MulNF::func; } } // mcl::fp #ifdef _MSC_VER #pragma warning(pop) #endif