#pragma once
/**
@file
@brief generic function for each N
@author MITSUNARI Shigeo(@herumi)
@license modified new BSD license
http://opensource.org/licenses/BSD-3-Clause
*/
#include <mcl/op.hpp>
#include <mcl/util.hpp>
#include <cybozu/bit_operation.hpp>
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4127)
#endif
namespace mcl { namespace fp {
struct Gtag; // GMP
struct Ltag; // LLVM
struct LBMI2tag; // LLVM with Intel BMI2 instruction
struct Atag; // asm
template<class Tag> struct TagToStr { };
template<> struct TagToStr<Gtag> { static const char *f() { return "Gtag"; } };
template<> struct TagToStr<Ltag> { static const char *f() { return "Ltag"; } };
template<> struct TagToStr<LBMI2tag> { static const char *f() { return "LBMI2tag"; } };
template<> struct TagToStr<Atag> { static const char *f() { return "Atag"; } };
template<size_t N>
void clearC(Unit *x)
{
clearArray(x, 0, N);
}
template<size_t N>
bool isZeroC(const Unit *x)
{
return isZeroArray(x, N);
}
template<size_t N>
void copyC(Unit *y, const Unit *x)
{
copyArray(y, x, N);
}
// (carry, z[N]) <- x[N] + y[N]
template<size_t N, class Tag = Gtag>
struct AddPre {
static inline Unit func(Unit *z, const Unit *x, const Unit *y)
{
#ifdef MCL_USE_VINT
return mcl::vint::addN(z, x, y, N);
#else
return mpn_add_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N);
#endif
}
static const u3u f;
};
template<size_t N, class Tag>
const u3u AddPre<N, Tag>::f = AddPre<N, Tag>::func;
// (carry, x[N]) <- x[N] + y
template<class Tag = Gtag>
struct AddUnitPre {
static inline Unit func(Unit *x, Unit n, Unit y)
{
#if 1
int ret = 0;
Unit t = x[0] + y;
x[0] = t;
if (t >= y) goto EXIT_0;
for (size_t i = 1; i < n; i++) {
t = x[i] + 1;
x[i] = t;
if (t != 0) goto EXIT_0;
}
ret = 1;
EXIT_0:
return ret;
#else
return mpn_add_1((mp_limb_t*)x, (const mp_limb_t*)x, (int)n, y);
#endif
}
static const u1uII f;
};
template<class Tag>
const u1uII AddUnitPre<Tag>::f = AddUnitPre<Tag>::func;
// (carry, z[N]) <- x[N] - y[N]
template<size_t N, class Tag = Gtag>
struct SubPre {
static inline Unit func(Unit *z, const Unit *x, const Unit *y)
{
#ifdef MCL_USE_VINT
return mcl::vint::subN(z, x, y, N);
#else
return mpn_sub_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N);
#endif
}
static const u3u f;
};
template<size_t N, class Tag>
const u3u SubPre<N, Tag>::f = SubPre<N, Tag>::func;
// y[N] <- (x[N] >> 1)
template<size_t N, class Tag = Gtag>
struct Shr1 {
static inline void func(Unit *y, const Unit *x)
{
#ifdef MCL_USE_VINT
mcl::vint::shrN(y, x, N, 1);
#else
mpn_rshift((mp_limb_t*)y, (const mp_limb_t*)x, (int)N, 1);
#endif
}
static const void2u f;
};
template<size_t N, class Tag>
const void2u Shr1<N, Tag>::f = Shr1<N, Tag>::func;
// y[N] <- (-x[N]) % p[N]
template<size_t N, class Tag = Gtag>
struct Neg {
static inline void func(Unit *y, const Unit *x, const Unit *p)
{
if (isZeroC<N>(x)) {
if (x != y) clearC<N>(y);
return;
}
SubPre<N, Tag>::f(y, p, x);
}
static const void3u f;
};
template<size_t N, class Tag>
const void3u Neg<N, Tag>::f = Neg<N, Tag>::func;
// z[N * 2] <- x[N] * y[N]
template<size_t N, class Tag = Gtag>
struct MulPreCore {
static inline void func(Unit *z, const Unit *x, const Unit *y)
{
#ifdef MCL_USE_VINT
mcl::vint::mulNM(z, x, N, y, N);
#else
mpn_mul_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, (int)N);
#endif
}
static const void3u f;
};
template<size_t N, class Tag>
const void3u MulPreCore<N, Tag>::f = MulPreCore<N, Tag>::func;
template<class Tag = Gtag>
struct EnableKaratsuba {
/* always use mpn* for Gtag */
static const size_t minMulN = 100;
static const size_t minSqrN = 100;
};
template<size_t N, class Tag = Gtag>
struct MulPre {
/*
W = 1 << H
x = aW + b, y = cW + d
xy = acW^2 + (ad + bc)W + bd
ad + bc = (a + b)(c + d) - ac - bd
*/
static inline void karatsuba(Unit *z, const Unit *x, const Unit *y)
{
const size_t H = N / 2;
MulPre<H, Tag>::f(z, x, y); // bd
MulPre<H, Tag>::f(z + N, x + H, y + H); // ac
Unit a_b[H];
Unit c_d[H];
Unit c1 = AddPre<H, Tag>::f(a_b, x, x + H); // a + b
Unit c2 = AddPre<H, Tag>::f(c_d, y, y + H); // c + d
Unit tmp[N];
MulPre<H, Tag>::f(tmp, a_b, c_d);
Unit c = c1 & c2;
if (c1) {
c += AddPre<H, Tag>::f(tmp + H, tmp + H, c_d);
}
if (c2) {
c += AddPre<H, Tag>::f(tmp + H, tmp + H, a_b);
}
// c:tmp[N] = (a + b)(c + d)
c -= SubPre<N, Tag>::f(tmp, tmp, z);
c -= SubPre<N, Tag>::f(tmp, tmp, z + N);
// c:tmp[N] = ad + bc
c += AddPre<N, Tag>::f(z + H, z + H, tmp);
assert(c <= 2);
if (c) {
AddUnitPre<Tag>::f(z + N + H, H, c);
}
}
static inline void func(Unit *z, const Unit *x, const Unit *y)
{
#if 1
if (N >= EnableKaratsuba<Tag>::minMulN && (N % 2) == 0) {
karatsuba(z, x, y);
return;
}
#endif
MulPreCore<N, Tag>::f(z, x, y);
}
static const void3u f;
};
template<size_t N, class Tag>
const void3u MulPre<N, Tag>::f = MulPre<N, Tag>::func;
template<class Tag>
struct MulPre<0, Tag> {
static inline void f(Unit*, const Unit*, const Unit*) {}
};
template<class Tag>
struct MulPre<1, Tag> {
static inline void f(Unit* z, const Unit* x, const Unit* y)
{
MulPreCore<1, Tag>::f(z, x, y);
}
};
// z[N * 2] <- x[N] * x[N]
template<size_t N, class Tag = Gtag>
struct SqrPreCore {
static inline void func(Unit *y, const Unit *x)
{
#ifdef MCL_USE_VINT
mcl::vint::sqrN(y, x, N);
#else
mpn_sqr((mp_limb_t*)y, (const mp_limb_t*)x, N);
#endif
}
static const void2u f;
};
template<size_t N, class Tag>
const void2u SqrPreCore<N, Tag>::f = SqrPreCore<N, Tag>::func;
template<size_t N, class Tag = Gtag>
struct SqrPre {
/*
W = 1 << H
x = aW + b
x^2 = aaW^2 + 2abW + bb
*/
static inline void karatsuba(Unit *z, const Unit *x)
{
const size_t H = N / 2;
SqrPre<H, Tag>::f(z, x); // b^2
SqrPre<H, Tag>::f(z + N, x + H); // a^2
Unit ab[N];
MulPre<H, Tag>::f(ab, x, x + H); // ab
Unit c = AddPre<N, Tag>::f(ab, ab, ab);
c += AddPre<N, Tag>::f(z + H, z + H, ab);
if (c) {
AddUnitPre<Tag>::f(z + N + H, H, c);
}
}
static inline void func(Unit *y, const Unit *x)
{
#if 1
if (N >= EnableKaratsuba<Tag>::minSqrN && (N % 2) == 0) {
karatsuba(y, x);
return;
}
#endif
SqrPreCore<N, Tag>::f(y, x);
}
static const void2u f;
};
template<size_t N, class Tag>
const void2u SqrPre<N, Tag>::f = SqrPre<N, Tag>::func;
template<class Tag>
struct SqrPre<0, Tag> {
static inline void f(Unit*, const Unit*) {}
};
template<class Tag>
struct SqrPre<1, Tag> {
static inline void f(Unit* y, const Unit* x)
{
SqrPreCore<1, Tag>::f(y, x);
}
};
// z[N + 1] <- x[N] * y
template<size_t N, class Tag = Gtag>
struct MulUnitPre {
static inline void func(Unit *z, const Unit *x, Unit y)
{
#ifdef MCL_USE_VINT
z[N] = mcl::vint::mulu1(z, x, N, y);
#else
z[N] = mpn_mul_1((mp_limb_t*)z, (const mp_limb_t*)x, N, y);
#endif
}
static const void2uI f;
};
template<size_t N, class Tag>
const void2uI MulUnitPre<N, Tag>::f = MulUnitPre<N, Tag>::func;
// z[N] <- x[N + 1] % p[N]
template<size_t N, class Tag = Gtag>
struct N1_Mod {
static inline void func(Unit *y, const Unit *x, const Unit *p)
{
#ifdef MCL_USE_VINT
mcl::vint::divNM<Unit>(0, 0, y, x, N + 1, p, N);
#else
mp_limb_t q[2]; // not used
mpn_tdiv_qr(q, (mp_limb_t*)y, 0, (const mp_limb_t*)x, N + 1, (const mp_limb_t*)p, N);
#endif
}
static const void3u f;
};
template<size_t N, class Tag>
const void3u N1_Mod<N, Tag>::f = N1_Mod<N, Tag>::func;
// z[N] <- (x[N] * y) % p[N]
template<size_t N, class Tag = Gtag>
struct MulUnit {
static inline void func(Unit *z, const Unit *x, Unit y, const Unit *p)
{
Unit xy[N + 1];
MulUnitPre<N, Tag>::f(xy, x, y);
#if 1
Unit len = UnitBitSize - 1 - cybozu::bsr(p[N - 1]);
Unit v = xy[N];
if (N > 1 && len < 3 && v < 0xff) {
for (;;) {
if (len == 0) {
v = xy[N];
} else {
v = (xy[N] << len) | (xy[N - 1] >> (UnitBitSize - len));
}
if (v == 0) break;
if (v == 1) {
xy[N] -= SubPre<N, Tag>::f(xy, xy, p);
} else {
Unit t[N + 1];
MulUnitPre<N, Tag>::f(t, p, v);
SubPre<N + 1, Tag>::f(xy, xy, t);
}
}
for (;;) {
if (SubPre<N, Tag>::f(z, xy, p)) {
copyC<N>(z, xy);
return;
}
if (SubPre<N, Tag>::f(xy, z, p)) {
return;
}
}
}
#endif
N1_Mod<N, Tag>::f(z, xy, p);
}
static const void2uIu f;
};
template<size_t N, class Tag>
const void2uIu MulUnit<N, Tag>::f = MulUnit<N, Tag>::func;
// z[N] <- x[N * 2] % p[N]
template<size_t N, class Tag = Gtag>
struct Dbl_Mod {
static inline void func(Unit *y, const Unit *x, const Unit *p)
{
#ifdef MCL_USE_VINT
mcl::vint::divNM<Unit>(0, 0, y, x, N * 2, p, N);
#else
mp_limb_t q[N + 1]; // not used
mpn_tdiv_qr(q, (mp_limb_t*)y, 0, (const mp_limb_t*)x, N * 2, (const mp_limb_t*)p, N);
#endif
}
static const void3u f;
};
template<size_t N, class Tag>
const void3u Dbl_Mod<N, Tag>::f = Dbl_Mod<N, Tag>::func;
template<size_t N, class Tag>
struct SubIfPossible {
static inline void f(Unit *z, const Unit *p)
{
Unit tmp[N - 1];
if (SubPre<N - 1, Tag>::f(tmp, z, p) == 0) {
copyC<N - 1>(z, tmp);
z[N - 1] = 0;
}
}
};
template<class Tag>
struct SubIfPossible<1, Tag> {
static inline void f(Unit *, const Unit *)
{
}
};
// z[N] <- (x[N] + y[N]) % p[N]
template<size_t N, bool isFullBit, class Tag = Gtag>
struct Add {
static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
{
if (isFullBit) {
if (AddPre<N, Tag>::f(z, x, y)) {
SubPre<N, Tag>::f(z, z, p);
return;
}
Unit tmp[N];
if (SubPre<N, Tag>::f(tmp, z, p) == 0) {
copyC<N>(z, tmp);
}
} else {
AddPre<N, Tag>::f(z, x, y);
Unit a = z[N - 1];
Unit b = p[N - 1];
if (a < b) return;
if (a > b) {
SubPre<N, Tag>::f(z, z, p);
return;
}
/* the top of z and p are same */
SubIfPossible<N, Tag>::f(z, p);
}
}
static const void4u f;
};
template<size_t N, bool isFullBit, class Tag>
const void4u Add<N, isFullBit, Tag>::f = Add<N, isFullBit, Tag>::func;
// z[N] <- (x[N] - y[N]) % p[N]
template<size_t N, bool isFullBit, class Tag = Gtag>
struct Sub {
static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
{
if (SubPre<N, Tag>::f(z, x, y)) {
AddPre<N, Tag>::f(z, z, p);
}
}
static const void4u f;
};
template<size_t N, bool isFullBit, class Tag>
const void4u Sub<N, isFullBit, Tag>::f = Sub<N, isFullBit, Tag>::func;
// z[N * 2] <- (x[N * 2] + y[N * 2]) mod p[N] << (N * UnitBitSize)
template<size_t N, class Tag = Gtag>
struct DblAdd {
static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
{
if (AddPre<N * 2, Tag>::f(z, x, y)) {
SubPre<N, Tag>::f(z + N, z + N, p);
return;
}
Unit tmp[N];
if (SubPre<N, Tag>::f(tmp, z + N, p) == 0) {
memcpy(z + N, tmp, sizeof(tmp));
}
}
static const void4u f;
};
template<size_t N, class Tag>
const void4u DblAdd<N, Tag>::f = DblAdd<N, Tag>::func;
// z[N * 2] <- (x[N * 2] - y[N * 2]) mod p[N] << (N * UnitBitSize)
template<size_t N, class Tag = Gtag>
struct DblSub {
static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
{
if (SubPre<N * 2, Tag>::f(z, x, y)) {
AddPre<N, Tag>::f(z + N, z + N, p);
}
}
static const void4u f;
};
template<size_t N, class Tag>
const void4u DblSub<N, Tag>::f = DblSub<N, Tag>::func;
/*
z[N] <- montRed(xy[N * 2], p[N])
REMARK : assume p[-1] = rp
*/
template<size_t N, class Tag = Gtag>
struct MontRed {
static inline void func(Unit *z, const Unit *xy, const Unit *p)
{
const Unit rp = p[-1];
Unit pq[N + 1];
Unit buf[N * 2 + 1];
copyC<N - 1>(buf + N + 1, xy + N + 1);
buf[N * 2] = 0;
Unit q = xy[0] * rp;
MulUnitPre<N, Tag>::f(pq, p, q);
Unit up = AddPre<N + 1, Tag>::f(buf, xy, pq);
if (up) {
buf[N * 2] = AddUnitPre<Tag>::f(buf + N + 1, N - 1, 1);
}
Unit *c = buf + 1;
for (size_t i = 1; i < N; i++) {
q = c[0] * rp;
MulUnitPre<N, Tag>::f(pq, p, q);
up = AddPre<N + 1, Tag>::f(c, c, pq);
if (up) {
AddUnitPre<Tag>::f(c + N + 1, N - i, 1);
}
c++;
}
if (c[N]) {
SubPre<N, Tag>::f(z, c, p);
} else {
if (SubPre<N, Tag>::f(z, c, p)) {
memcpy(z, c, N * sizeof(Unit));
}
}
}
static const void3u f;
};
template<size_t N, class Tag>
const void3u MontRed<N, Tag>::f = MontRed<N, Tag>::func;
/*
z[N] <- Montgomery(x[N], y[N], p[N])
REMARK : assume p[-1] = rp
*/
template<size_t N, bool isFullBit, class Tag = Gtag>
struct Mont {
static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
{
#if MCL_MAX_BIT_SIZE == 1024 || MCL_SIZEOF_UNIT == 4 // check speed
Unit xy[N * 2];
MulPre<N, Tag>::f(xy, x, y);
MontRed<N, Tag>::f(z, xy, p);
#else
const Unit rp = p[-1];
if (isFullBit) {
Unit buf[N * 2 + 2];
Unit *c = buf;
MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0]
Unit q = c[0] * rp;
Unit t[N + 2];
MulUnitPre<N, Tag>::f(t, p, q); // p * q
t[N + 1] = 0; // always zero
c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t);
c++;
for (size_t i = 1; i < N; i++) {
MulUnitPre<N, Tag>::f(t, x, y[i]);
c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t);
q = c[0] * rp;
MulUnitPre<N, Tag>::f(t, p, q);
AddPre<N + 2, Tag>::f(c, c, t);
c++;
}
if (c[N]) {
SubPre<N, Tag>::f(z, c, p);
} else {
if (SubPre<N, Tag>::f(z, c, p)) {
memcpy(z, c, N * sizeof(Unit));
}
}
} else {
/*
R = 1 << 64
L % 64 = 63 ; not full bit
F = 1 << (L + 1)
max p = (1 << L) - 1
x, y <= p - 1
max x * y[0], p * q <= ((1 << L) - 1)(R - 1)
t = x * y[i] + p * q <= 2((1 << L) - 1)(R - 1) = (F - 2)(R - 1)
t >> 64 <= (F - 2)(R - 1)/R = (F - 2) - (F - 2)/R
t + (t >> 64) = (F - 2)R - (F - 2)/R < FR
*/
Unit carry;
(void)carry;
Unit buf[N * 2 + 1];
Unit *c = buf;
MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0]
Unit q = c[0] * rp;
Unit t[N + 1];
MulUnitPre<N, Tag>::f(t, p, q); // p * q
carry = AddPre<N + 1, Tag>::f(c, c, t);
assert(carry == 0);
c++;
c[N] = 0;
for (size_t i = 1; i < N; i++) {
c[N + 1] = 0;
MulUnitPre<N, Tag>::f(t, x, y[i]);
carry = AddPre<N + 1, Tag>::f(c, c, t);
assert(carry == 0);
q = c[0] * rp;
MulUnitPre<N, Tag>::f(t, p, q);
carry = AddPre<N + 1, Tag>::f(c, c, t);
assert(carry == 0);
c++;
}
assert(c[N] == 0);
if (SubPre<N, Tag>::f(z, c, p)) {
memcpy(z, c, N * sizeof(Unit));
}
}
#endif
}
static const void4u f;
};
template<size_t N, bool isFullBit, class Tag>
const void4u Mont<N, isFullBit, Tag>::f = Mont<N, isFullBit, Tag>::func;
// z[N] <- Montgomery(x[N], x[N], p[N])
template<size_t N, bool isFullBit, class Tag = Gtag>
struct SqrMont {
static inline void func(Unit *y, const Unit *x, const Unit *p)
{
#if MCL_MAX_BIT_SIZE == 1024 || MCL_SIZEOF_UNIT == 4 // check speed
Unit xx[N * 2];
SqrPre<N, Tag>::f(xx, x);
MontRed<N, Tag>::f(y, xx, p);
#else
Mont<N, isFullBit, Tag>::f(y, x, x, p);
#endif
}
static const void3u f;
};
template<size_t N, bool isFullBit, class Tag>
const void3u SqrMont<N, isFullBit, Tag>::f = SqrMont<N, isFullBit, Tag>::func;
// z[N] <- (x[N] * y[N]) % p[N]
template<size_t N, class Tag = Gtag>
struct Mul {
static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
{
Unit xy[N * 2];
MulPre<N, Tag>::f(xy, x, y);
Dbl_Mod<N, Tag>::f(z, xy, p);
}
static const void4u f;
};
template<size_t N, class Tag>
const void4u Mul<N, Tag>::f = Mul<N, Tag>::func;
// y[N] <- (x[N] * x[N]) % p[N]
template<size_t N, class Tag = Gtag>
struct Sqr {
static inline void func(Unit *y, const Unit *x, const Unit *p)
{
Unit xx[N * 2];
SqrPre<N, Tag>::f(xx, x);
Dbl_Mod<N, Tag>::f(y, xx, p);
}
static const void3u f;
};
template<size_t N, class Tag>
const void3u Sqr<N, Tag>::f = Sqr<N, Tag>::func;
template<size_t N, class Tag = Gtag>
struct Fp2MulNF {
static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
{
const Unit *const a = x;
const Unit *const b = x + N;
const Unit *const c = y;
const Unit *const d = y + N;
Unit d0[N * 2];
Unit d1[N * 2];
Unit d2[N * 2];
Unit s[N];
Unit t[N];
AddPre<N, Tag>::f(s, a, b);
AddPre<N, Tag>::f(t, c, d);
MulPre<N, Tag>::f(d0, s, t);
MulPre<N, Tag>::f(d1, a, c);
MulPre<N, Tag>::f(d2, b, d);
SubPre<N * 2, Tag>::f(d0, d0, d1);
SubPre<N * 2, Tag>::f(d0, d0, d2);
MontRed<N, Tag>::f(z + N, d0, p);
DblSub<N, Tag>::f(d1, d1, d2, p);
MontRed<N, Tag>::f(z, d1, p);
}
static const void4u f;
};
template<size_t N, class Tag>
const void4u Fp2MulNF<N, Tag>::f = Fp2MulNF<N, Tag>::func;
} } // mcl::fp
#ifdef _MSC_VER
#pragma warning(pop)
#endif