aboutsummaryrefslogtreecommitdiffstats
path: root/crypto/bn256/cloudflare
diff options
context:
space:
mode:
authorPéter Szilágyi <peterke@gmail.com>2018-03-20 00:13:54 +0800
committerGitHub <noreply@github.com>2018-03-20 00:13:54 +0800
commit1203c6a237cb87b78ec495772cecb178200499ce (patch)
treea51e6c3a24e43f265fc5c9b4f2bdb7ff7de6a8db /crypto/bn256/cloudflare
parent0965761a45562d609f6036963dbac84561174677 (diff)
downloaddexon-1203c6a237cb87b78ec495772cecb178200499ce.tar
dexon-1203c6a237cb87b78ec495772cecb178200499ce.tar.gz
dexon-1203c6a237cb87b78ec495772cecb178200499ce.tar.bz2
dexon-1203c6a237cb87b78ec495772cecb178200499ce.tar.lz
dexon-1203c6a237cb87b78ec495772cecb178200499ce.tar.xz
dexon-1203c6a237cb87b78ec495772cecb178200499ce.tar.zst
dexon-1203c6a237cb87b78ec495772cecb178200499ce.zip
crypto/bn256: full switchover to cloudflare's code (#16301)
* crypto/bn256: full switchover to cloudflare's code * crypto/bn256: only use cloudflare for optimized architectures * crypto/bn256: upstream fallback for non-optimized code * .travis, build: drop support for Go 1.8 (need type aliases) * crypto/bn256/cloudflare: enable curve mul lattice optimization
Diffstat (limited to 'crypto/bn256/cloudflare')
-rw-r--r--crypto/bn256/cloudflare/bn256_test.go2
-rw-r--r--crypto/bn256/cloudflare/curve.go19
-rw-r--r--crypto/bn256/cloudflare/example_test.go2
-rw-r--r--crypto/bn256/cloudflare/gfp.h32
-rw-r--r--crypto/bn256/cloudflare/gfp_amd64.go15
-rw-r--r--crypto/bn256/cloudflare/gfp_amd64.s42
-rw-r--r--crypto/bn256/cloudflare/gfp_arm64.s113
-rw-r--r--crypto/bn256/cloudflare/gfp_decl.go18
-rw-r--r--crypto/bn256/cloudflare/gfp_generic.go173
-rw-r--r--crypto/bn256/cloudflare/gfp_pure.go19
-rw-r--r--crypto/bn256/cloudflare/gfp_test.go2
-rw-r--r--crypto/bn256/cloudflare/lattice.go115
-rw-r--r--crypto/bn256/cloudflare/lattice_test.go29
-rw-r--r--crypto/bn256/cloudflare/main_test.go2
-rw-r--r--crypto/bn256/cloudflare/mul_amd64.h (renamed from crypto/bn256/cloudflare/mul.h)0
-rw-r--r--crypto/bn256/cloudflare/mul_arm64.h133
-rw-r--r--crypto/bn256/cloudflare/mul_bmi2_amd64.h (renamed from crypto/bn256/cloudflare/mul_bmi2.h)0
17 files changed, 632 insertions, 84 deletions
diff --git a/crypto/bn256/cloudflare/bn256_test.go b/crypto/bn256/cloudflare/bn256_test.go
index 369a3edaa..0c8016d86 100644
--- a/crypto/bn256/cloudflare/bn256_test.go
+++ b/crypto/bn256/cloudflare/bn256_test.go
@@ -1,5 +1,3 @@
-// +build amd64,!appengine,!gccgo
-
package bn256
import (
diff --git a/crypto/bn256/cloudflare/curve.go b/crypto/bn256/cloudflare/curve.go
index b6aecc0a6..18e9b38f3 100644
--- a/crypto/bn256/cloudflare/curve.go
+++ b/crypto/bn256/cloudflare/curve.go
@@ -183,15 +183,24 @@ func (c *curvePoint) Double(a *curvePoint) {
}
func (c *curvePoint) Mul(a *curvePoint, scalar *big.Int) {
- sum, t := &curvePoint{}, &curvePoint{}
+ precomp := [1 << 2]*curvePoint{nil, {}, {}, {}}
+ precomp[1].Set(a)
+ precomp[2].Set(a)
+ gfpMul(&precomp[2].x, &precomp[2].x, xiTo2PSquaredMinus2Over3)
+ precomp[3].Add(precomp[1], precomp[2])
+
+ multiScalar := curveLattice.Multi(scalar)
+
+ sum := &curvePoint{}
sum.SetInfinity()
+ t := &curvePoint{}
- for i := scalar.BitLen(); i >= 0; i-- {
+ for i := len(multiScalar) - 1; i >= 0; i-- {
t.Double(sum)
- if scalar.Bit(i) != 0 {
- sum.Add(t, a)
- } else {
+ if multiScalar[i] == 0 {
sum.Set(t)
+ } else {
+ sum.Add(t, precomp[multiScalar[i]])
}
}
c.Set(sum)
diff --git a/crypto/bn256/cloudflare/example_test.go b/crypto/bn256/cloudflare/example_test.go
index 2ee545c67..b2d19807a 100644
--- a/crypto/bn256/cloudflare/example_test.go
+++ b/crypto/bn256/cloudflare/example_test.go
@@ -2,8 +2,6 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64,!appengine,!gccgo
-
package bn256
import (
diff --git a/crypto/bn256/cloudflare/gfp.h b/crypto/bn256/cloudflare/gfp.h
deleted file mode 100644
index 66f5a4d07..000000000
--- a/crypto/bn256/cloudflare/gfp.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#define storeBlock(a0,a1,a2,a3, r) \
- MOVQ a0, 0+r \
- MOVQ a1, 8+r \
- MOVQ a2, 16+r \
- MOVQ a3, 24+r
-
-#define loadBlock(r, a0,a1,a2,a3) \
- MOVQ 0+r, a0 \
- MOVQ 8+r, a1 \
- MOVQ 16+r, a2 \
- MOVQ 24+r, a3
-
-#define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \
- \ // b = a-p
- MOVQ a0, b0 \
- MOVQ a1, b1 \
- MOVQ a2, b2 \
- MOVQ a3, b3 \
- MOVQ a4, b4 \
- \
- SUBQ ·p2+0(SB), b0 \
- SBBQ ·p2+8(SB), b1 \
- SBBQ ·p2+16(SB), b2 \
- SBBQ ·p2+24(SB), b3 \
- SBBQ $0, b4 \
- \
- \ // if b is negative then return a
- \ // else return b
- CMOVQCC b0, a0 \
- CMOVQCC b1, a1 \
- CMOVQCC b2, a2 \
- CMOVQCC b3, a3
diff --git a/crypto/bn256/cloudflare/gfp_amd64.go b/crypto/bn256/cloudflare/gfp_amd64.go
deleted file mode 100644
index ac4f1a9c6..000000000
--- a/crypto/bn256/cloudflare/gfp_amd64.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// +build amd64,!appengine,!gccgo
-
-package bn256
-
-// go:noescape
-func gfpNeg(c, a *gfP)
-
-//go:noescape
-func gfpAdd(c, a, b *gfP)
-
-//go:noescape
-func gfpSub(c, a, b *gfP)
-
-//go:noescape
-func gfpMul(c, a, b *gfP)
diff --git a/crypto/bn256/cloudflare/gfp_amd64.s b/crypto/bn256/cloudflare/gfp_amd64.s
index 2d0176f2e..3a785d200 100644
--- a/crypto/bn256/cloudflare/gfp_amd64.s
+++ b/crypto/bn256/cloudflare/gfp_amd64.s
@@ -1,8 +1,40 @@
-// +build amd64,!appengine,!gccgo
-
-#include "gfp.h"
-#include "mul.h"
-#include "mul_bmi2.h"
+// +build amd64,!generic
+
+#define storeBlock(a0,a1,a2,a3, r) \
+ MOVQ a0, 0+r \
+ MOVQ a1, 8+r \
+ MOVQ a2, 16+r \
+ MOVQ a3, 24+r
+
+#define loadBlock(r, a0,a1,a2,a3) \
+ MOVQ 0+r, a0 \
+ MOVQ 8+r, a1 \
+ MOVQ 16+r, a2 \
+ MOVQ 24+r, a3
+
+#define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \
+ \ // b = a-p
+ MOVQ a0, b0 \
+ MOVQ a1, b1 \
+ MOVQ a2, b2 \
+ MOVQ a3, b3 \
+ MOVQ a4, b4 \
+ \
+ SUBQ ·p2+0(SB), b0 \
+ SBBQ ·p2+8(SB), b1 \
+ SBBQ ·p2+16(SB), b2 \
+ SBBQ ·p2+24(SB), b3 \
+ SBBQ $0, b4 \
+ \
+ \ // if b is negative then return a
+ \ // else return b
+ CMOVQCC b0, a0 \
+ CMOVQCC b1, a1 \
+ CMOVQCC b2, a2 \
+ CMOVQCC b3, a3
+
+#include "mul_amd64.h"
+#include "mul_bmi2_amd64.h"
TEXT ·gfpNeg(SB),0,$0-16
MOVQ ·p2+0(SB), R8
diff --git a/crypto/bn256/cloudflare/gfp_arm64.s b/crypto/bn256/cloudflare/gfp_arm64.s
new file mode 100644
index 000000000..c65e80168
--- /dev/null
+++ b/crypto/bn256/cloudflare/gfp_arm64.s
@@ -0,0 +1,113 @@
+// +build arm64,!generic
+
+#define storeBlock(a0,a1,a2,a3, r) \
+ MOVD a0, 0+r \
+ MOVD a1, 8+r \
+ MOVD a2, 16+r \
+ MOVD a3, 24+r
+
+#define loadBlock(r, a0,a1,a2,a3) \
+ MOVD 0+r, a0 \
+ MOVD 8+r, a1 \
+ MOVD 16+r, a2 \
+ MOVD 24+r, a3
+
+#define loadModulus(p0,p1,p2,p3) \
+ MOVD ·p2+0(SB), p0 \
+ MOVD ·p2+8(SB), p1 \
+ MOVD ·p2+16(SB), p2 \
+ MOVD ·p2+24(SB), p3
+
+#include "mul_arm64.h"
+
+TEXT ·gfpNeg(SB),0,$0-16
+ MOVD a+8(FP), R0
+ loadBlock(0(R0), R1,R2,R3,R4)
+ loadModulus(R5,R6,R7,R8)
+
+ SUBS R1, R5, R1
+ SBCS R2, R6, R2
+ SBCS R3, R7, R3
+ SBCS R4, R8, R4
+
+ SUBS R5, R1, R5
+ SBCS R6, R2, R6
+ SBCS R7, R3, R7
+ SBCS R8, R4, R8
+
+ CSEL CS, R5, R1, R1
+ CSEL CS, R6, R2, R2
+ CSEL CS, R7, R3, R3
+ CSEL CS, R8, R4, R4
+
+ MOVD c+0(FP), R0
+ storeBlock(R1,R2,R3,R4, 0(R0))
+ RET
+
+TEXT ·gfpAdd(SB),0,$0-24
+ MOVD a+8(FP), R0
+ loadBlock(0(R0), R1,R2,R3,R4)
+ MOVD b+16(FP), R0
+ loadBlock(0(R0), R5,R6,R7,R8)
+ loadModulus(R9,R10,R11,R12)
+ MOVD ZR, R0
+
+ ADDS R5, R1
+ ADCS R6, R2
+ ADCS R7, R3
+ ADCS R8, R4
+ ADCS ZR, R0
+
+ SUBS R9, R1, R5
+ SBCS R10, R2, R6
+ SBCS R11, R3, R7
+ SBCS R12, R4, R8
+ SBCS ZR, R0, R0
+
+ CSEL CS, R5, R1, R1
+ CSEL CS, R6, R2, R2
+ CSEL CS, R7, R3, R3
+ CSEL CS, R8, R4, R4
+
+ MOVD c+0(FP), R0
+ storeBlock(R1,R2,R3,R4, 0(R0))
+ RET
+
+TEXT ·gfpSub(SB),0,$0-24
+ MOVD a+8(FP), R0
+ loadBlock(0(R0), R1,R2,R3,R4)
+ MOVD b+16(FP), R0
+ loadBlock(0(R0), R5,R6,R7,R8)
+ loadModulus(R9,R10,R11,R12)
+
+ SUBS R5, R1
+ SBCS R6, R2
+ SBCS R7, R3
+ SBCS R8, R4
+
+ CSEL CS, ZR, R9, R9
+ CSEL CS, ZR, R10, R10
+ CSEL CS, ZR, R11, R11
+ CSEL CS, ZR, R12, R12
+
+ ADDS R9, R1
+ ADCS R10, R2
+ ADCS R11, R3
+ ADCS R12, R4
+
+ MOVD c+0(FP), R0
+ storeBlock(R1,R2,R3,R4, 0(R0))
+ RET
+
+TEXT ·gfpMul(SB),0,$0-24
+ MOVD a+8(FP), R0
+ loadBlock(0(R0), R1,R2,R3,R4)
+ MOVD b+16(FP), R0
+ loadBlock(0(R0), R5,R6,R7,R8)
+
+ mul(R9,R10,R11,R12,R13,R14,R15,R16)
+ gfpReduce()
+
+ MOVD c+0(FP), R0
+ storeBlock(R1,R2,R3,R4, 0(R0))
+ RET
diff --git a/crypto/bn256/cloudflare/gfp_decl.go b/crypto/bn256/cloudflare/gfp_decl.go
new file mode 100644
index 000000000..6a8a4fddb
--- /dev/null
+++ b/crypto/bn256/cloudflare/gfp_decl.go
@@ -0,0 +1,18 @@
+// +build amd64,!generic arm64,!generic
+
+package bn256
+
+// This file contains forward declarations for the architecture-specific
+// assembly implementations of these functions, provided that they exist.
+
+// go:noescape
+func gfpNeg(c, a *gfP)
+
+//go:noescape
+func gfpAdd(c, a, b *gfP)
+
+//go:noescape
+func gfpSub(c, a, b *gfP)
+
+//go:noescape
+func gfpMul(c, a, b *gfP)
diff --git a/crypto/bn256/cloudflare/gfp_generic.go b/crypto/bn256/cloudflare/gfp_generic.go
new file mode 100644
index 000000000..8e6be9596
--- /dev/null
+++ b/crypto/bn256/cloudflare/gfp_generic.go
@@ -0,0 +1,173 @@
+// +build !amd64,!arm64 generic
+
+package bn256
+
+func gfpCarry(a *gfP, head uint64) {
+ b := &gfP{}
+
+ var carry uint64
+ for i, pi := range p2 {
+ ai := a[i]
+ bi := ai - pi - carry
+ b[i] = bi
+ carry = (pi&^ai | (pi|^ai)&bi) >> 63
+ }
+ carry = carry &^ head
+
+ // If b is negative, then return a.
+ // Else return b.
+ carry = -carry
+ ncarry := ^carry
+ for i := 0; i < 4; i++ {
+ a[i] = (a[i] & carry) | (b[i] & ncarry)
+ }
+}
+
+func gfpNeg(c, a *gfP) {
+ var carry uint64
+ for i, pi := range p2 {
+ ai := a[i]
+ ci := pi - ai - carry
+ c[i] = ci
+ carry = (ai&^pi | (ai|^pi)&ci) >> 63
+ }
+ gfpCarry(c, 0)
+}
+
+func gfpAdd(c, a, b *gfP) {
+ var carry uint64
+ for i, ai := range a {
+ bi := b[i]
+ ci := ai + bi + carry
+ c[i] = ci
+ carry = (ai&bi | (ai|bi)&^ci) >> 63
+ }
+ gfpCarry(c, carry)
+}
+
+func gfpSub(c, a, b *gfP) {
+ t := &gfP{}
+
+ var carry uint64
+ for i, pi := range p2 {
+ bi := b[i]
+ ti := pi - bi - carry
+ t[i] = ti
+ carry = (bi&^pi | (bi|^pi)&ti) >> 63
+ }
+
+ carry = 0
+ for i, ai := range a {
+ ti := t[i]
+ ci := ai + ti + carry
+ c[i] = ci
+ carry = (ai&ti | (ai|ti)&^ci) >> 63
+ }
+ gfpCarry(c, carry)
+}
+
+func mul(a, b [4]uint64) [8]uint64 {
+ const (
+ mask16 uint64 = 0x0000ffff
+ mask32 uint64 = 0xffffffff
+ )
+
+ var buff [32]uint64
+ for i, ai := range a {
+ a0, a1, a2, a3 := ai&mask16, (ai>>16)&mask16, (ai>>32)&mask16, ai>>48
+
+ for j, bj := range b {
+ b0, b2 := bj&mask32, bj>>32
+
+ off := 4 * (i + j)
+ buff[off+0] += a0 * b0
+ buff[off+1] += a1 * b0
+ buff[off+2] += a2*b0 + a0*b2
+ buff[off+3] += a3*b0 + a1*b2
+ buff[off+4] += a2 * b2
+ buff[off+5] += a3 * b2
+ }
+ }
+
+ for i := uint(1); i < 4; i++ {
+ shift := 16 * i
+
+ var head, carry uint64
+ for j := uint(0); j < 8; j++ {
+ block := 4 * j
+
+ xi := buff[block]
+ yi := (buff[block+i] << shift) + head
+ zi := xi + yi + carry
+ buff[block] = zi
+ carry = (xi&yi | (xi|yi)&^zi) >> 63
+
+ head = buff[block+i] >> (64 - shift)
+ }
+ }
+
+ return [8]uint64{buff[0], buff[4], buff[8], buff[12], buff[16], buff[20], buff[24], buff[28]}
+}
+
+func halfMul(a, b [4]uint64) [4]uint64 {
+ const (
+ mask16 uint64 = 0x0000ffff
+ mask32 uint64 = 0xffffffff
+ )
+
+ var buff [18]uint64
+ for i, ai := range a {
+ a0, a1, a2, a3 := ai&mask16, (ai>>16)&mask16, (ai>>32)&mask16, ai>>48
+
+ for j, bj := range b {
+ if i+j > 3 {
+ break
+ }
+ b0, b2 := bj&mask32, bj>>32
+
+ off := 4 * (i + j)
+ buff[off+0] += a0 * b0
+ buff[off+1] += a1 * b0
+ buff[off+2] += a2*b0 + a0*b2
+ buff[off+3] += a3*b0 + a1*b2
+ buff[off+4] += a2 * b2
+ buff[off+5] += a3 * b2
+ }
+ }
+
+ for i := uint(1); i < 4; i++ {
+ shift := 16 * i
+
+ var head, carry uint64
+ for j := uint(0); j < 4; j++ {
+ block := 4 * j
+
+ xi := buff[block]
+ yi := (buff[block+i] << shift) + head
+ zi := xi + yi + carry
+ buff[block] = zi
+ carry = (xi&yi | (xi|yi)&^zi) >> 63
+
+ head = buff[block+i] >> (64 - shift)
+ }
+ }
+
+ return [4]uint64{buff[0], buff[4], buff[8], buff[12]}
+}
+
+func gfpMul(c, a, b *gfP) {
+ T := mul(*a, *b)
+ m := halfMul([4]uint64{T[0], T[1], T[2], T[3]}, np)
+ t := mul([4]uint64{m[0], m[1], m[2], m[3]}, p2)
+
+ var carry uint64
+ for i, Ti := range T {
+ ti := t[i]
+ zi := Ti + ti + carry
+ T[i] = zi
+ carry = (Ti&ti | (Ti|ti)&^zi) >> 63
+ }
+
+ *c = gfP{T[4], T[5], T[6], T[7]}
+ gfpCarry(c, carry)
+}
diff --git a/crypto/bn256/cloudflare/gfp_pure.go b/crypto/bn256/cloudflare/gfp_pure.go
deleted file mode 100644
index 8fa5d3053..000000000
--- a/crypto/bn256/cloudflare/gfp_pure.go
+++ /dev/null
@@ -1,19 +0,0 @@
-// +build !amd64 appengine gccgo
-
-package bn256
-
-func gfpNeg(c, a *gfP) {
- panic("unsupported architecture")
-}
-
-func gfpAdd(c, a, b *gfP) {
- panic("unsupported architecture")
-}
-
-func gfpSub(c, a, b *gfP) {
- panic("unsupported architecture")
-}
-
-func gfpMul(c, a, b *gfP) {
- panic("unsupported architecture")
-}
diff --git a/crypto/bn256/cloudflare/gfp_test.go b/crypto/bn256/cloudflare/gfp_test.go
index aff5e0531..16ab2a841 100644
--- a/crypto/bn256/cloudflare/gfp_test.go
+++ b/crypto/bn256/cloudflare/gfp_test.go
@@ -1,5 +1,3 @@
-// +build amd64,!appengine,!gccgo
-
package bn256
import (
diff --git a/crypto/bn256/cloudflare/lattice.go b/crypto/bn256/cloudflare/lattice.go
new file mode 100644
index 000000000..f9ace4d9f
--- /dev/null
+++ b/crypto/bn256/cloudflare/lattice.go
@@ -0,0 +1,115 @@
+package bn256
+
+import (
+ "math/big"
+)
+
+var half = new(big.Int).Rsh(Order, 1)
+
+var curveLattice = &lattice{
+ vectors: [][]*big.Int{
+ {bigFromBase10("147946756881789319000765030803803410728"), bigFromBase10("147946756881789319010696353538189108491")},
+ {bigFromBase10("147946756881789319020627676272574806254"), bigFromBase10("-147946756881789318990833708069417712965")},
+ },
+ inverse: []*big.Int{
+ bigFromBase10("147946756881789318990833708069417712965"),
+ bigFromBase10("147946756881789319010696353538189108491"),
+ },
+ det: bigFromBase10("43776485743678550444492811490514550177096728800832068687396408373151616991234"),
+}
+
+var targetLattice = &lattice{
+ vectors: [][]*big.Int{
+ {bigFromBase10("9931322734385697761"), bigFromBase10("9931322734385697761"), bigFromBase10("9931322734385697763"), bigFromBase10("9931322734385697764")},
+ {bigFromBase10("4965661367192848881"), bigFromBase10("4965661367192848881"), bigFromBase10("4965661367192848882"), bigFromBase10("-9931322734385697762")},
+ {bigFromBase10("-9931322734385697762"), bigFromBase10("-4965661367192848881"), bigFromBase10("4965661367192848881"), bigFromBase10("-4965661367192848882")},
+ {bigFromBase10("9931322734385697763"), bigFromBase10("-4965661367192848881"), bigFromBase10("-4965661367192848881"), bigFromBase10("-4965661367192848881")},
+ },
+ inverse: []*big.Int{
+ bigFromBase10("734653495049373973658254490726798021314063399421879442165"),
+ bigFromBase10("147946756881789319000765030803803410728"),
+ bigFromBase10("-147946756881789319005730692170996259609"),
+ bigFromBase10("1469306990098747947464455738335385361643788813749140841702"),
+ },
+ det: new(big.Int).Set(Order),
+}
+
+type lattice struct {
+ vectors [][]*big.Int
+ inverse []*big.Int
+ det *big.Int
+}
+
+// decompose takes a scalar mod Order as input and finds a short, positive decomposition of it wrt to the lattice basis.
+func (l *lattice) decompose(k *big.Int) []*big.Int {
+ n := len(l.inverse)
+
+ // Calculate closest vector in lattice to <k,0,0,...> with Babai's rounding.
+ c := make([]*big.Int, n)
+ for i := 0; i < n; i++ {
+ c[i] = new(big.Int).Mul(k, l.inverse[i])
+ round(c[i], l.det)
+ }
+
+ // Transform vectors according to c and subtract <k,0,0,...>.
+ out := make([]*big.Int, n)
+ temp := new(big.Int)
+
+ for i := 0; i < n; i++ {
+ out[i] = new(big.Int)
+
+ for j := 0; j < n; j++ {
+ temp.Mul(c[j], l.vectors[j][i])
+ out[i].Add(out[i], temp)
+ }
+
+ out[i].Neg(out[i])
+ out[i].Add(out[i], l.vectors[0][i]).Add(out[i], l.vectors[0][i])
+ }
+ out[0].Add(out[0], k)
+
+ return out
+}
+
+func (l *lattice) Precompute(add func(i, j uint)) {
+ n := uint(len(l.vectors))
+ total := uint(1) << n
+
+ for i := uint(0); i < n; i++ {
+ for j := uint(0); j < total; j++ {
+ if (j>>i)&1 == 1 {
+ add(i, j)
+ }
+ }
+ }
+}
+
+func (l *lattice) Multi(scalar *big.Int) []uint8 {
+ decomp := l.decompose(scalar)
+
+ maxLen := 0
+ for _, x := range decomp {
+ if x.BitLen() > maxLen {
+ maxLen = x.BitLen()
+ }
+ }
+
+ out := make([]uint8, maxLen)
+ for j, x := range decomp {
+ for i := 0; i < maxLen; i++ {
+ out[i] += uint8(x.Bit(i)) << uint(j)
+ }
+ }
+
+ return out
+}
+
+// round sets num to num/denom rounded to the nearest integer.
+func round(num, denom *big.Int) {
+ r := new(big.Int)
+ num.DivMod(num, denom, r)
+
+ if r.Cmp(half) == 1 {
+ num.Add(num, big.NewInt(1))
+ }
+}
diff --git a/crypto/bn256/cloudflare/lattice_test.go b/crypto/bn256/cloudflare/lattice_test.go
new file mode 100644
index 000000000..4d52ad9b2
--- /dev/null
+++ b/crypto/bn256/cloudflare/lattice_test.go
@@ -0,0 +1,29 @@
+package bn256
+
+import (
+ "crypto/rand"
+
+ "testing"
+)
+
+func TestLatticeReduceCurve(t *testing.T) {
+ k, _ := rand.Int(rand.Reader, Order)
+ ks := curveLattice.decompose(k)
+
+ if ks[0].BitLen() > 130 || ks[1].BitLen() > 130 {
+ t.Fatal("reduction too large")
+ } else if ks[0].Sign() < 0 || ks[1].Sign() < 0 {
+ t.Fatal("reduction must be positive")
+ }
+}
+
+func TestLatticeReduceTarget(t *testing.T) {
+ k, _ := rand.Int(rand.Reader, Order)
+ ks := targetLattice.decompose(k)
+
+ if ks[0].BitLen() > 66 || ks[1].BitLen() > 66 || ks[2].BitLen() > 66 || ks[3].BitLen() > 66 {
+ t.Fatal("reduction too large")
+ } else if ks[0].Sign() < 0 || ks[1].Sign() < 0 || ks[2].Sign() < 0 || ks[3].Sign() < 0 {
+ t.Fatal("reduction must be positive")
+ }
+}
diff --git a/crypto/bn256/cloudflare/main_test.go b/crypto/bn256/cloudflare/main_test.go
index f0d59a404..0230f1b19 100644
--- a/crypto/bn256/cloudflare/main_test.go
+++ b/crypto/bn256/cloudflare/main_test.go
@@ -1,5 +1,3 @@
-// +build amd64,!appengine,!gccgo
-
package bn256
import (
diff --git a/crypto/bn256/cloudflare/mul.h b/crypto/bn256/cloudflare/mul_amd64.h
index bab5da831..bab5da831 100644
--- a/crypto/bn256/cloudflare/mul.h
+++ b/crypto/bn256/cloudflare/mul_amd64.h
diff --git a/crypto/bn256/cloudflare/mul_arm64.h b/crypto/bn256/cloudflare/mul_arm64.h
new file mode 100644
index 000000000..75d522173
--- /dev/null
+++ b/crypto/bn256/cloudflare/mul_arm64.h
@@ -0,0 +1,133 @@
+#define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
+ MUL R1, R5, c0 \
+ UMULH R1, R5, c1 \
+ MUL R1, R6, R0 \
+ ADDS R0, c1 \
+ UMULH R1, R6, c2 \
+ MUL R1, R7, R0 \
+ ADCS R0, c2 \
+ UMULH R1, R7, c3 \
+ MUL R1, R8, R0 \
+ ADCS R0, c3 \
+ UMULH R1, R8, c4 \
+ ADCS ZR, c4 \
+ \
+ MUL R2, R5, R25 \
+ UMULH R2, R5, R26 \
+ MUL R2, R6, R0 \
+ ADDS R0, R26 \
+ UMULH R2, R6, R27 \
+ MUL R2, R7, R0 \
+ ADCS R0, R27 \
+ UMULH R2, R7, R29 \
+ MUL R2, R8, R0 \
+ ADCS R0, R29 \
+ UMULH R2, R8, c5 \
+ ADCS ZR, c5 \
+ ADDS R25, c1 \
+ ADCS R26, c2 \
+ ADCS R27, c3 \
+ ADCS R29, c4 \
+ ADCS ZR, c5 \
+ \
+ MUL R3, R5, R25 \
+ UMULH R3, R5, R26 \
+ MUL R3, R6, R0 \
+ ADDS R0, R26 \
+ UMULH R3, R6, R27 \
+ MUL R3, R7, R0 \
+ ADCS R0, R27 \
+ UMULH R3, R7, R29 \
+ MUL R3, R8, R0 \
+ ADCS R0, R29 \
+ UMULH R3, R8, c6 \
+ ADCS ZR, c6 \
+ ADDS R25, c2 \
+ ADCS R26, c3 \
+ ADCS R27, c4 \
+ ADCS R29, c5 \
+ ADCS ZR, c6 \
+ \
+ MUL R4, R5, R25 \
+ UMULH R4, R5, R26 \
+ MUL R4, R6, R0 \
+ ADDS R0, R26 \
+ UMULH R4, R6, R27 \
+ MUL R4, R7, R0 \
+ ADCS R0, R27 \
+ UMULH R4, R7, R29 \
+ MUL R4, R8, R0 \
+ ADCS R0, R29 \
+ UMULH R4, R8, c7 \
+ ADCS ZR, c7 \
+ ADDS R25, c3 \
+ ADCS R26, c4 \
+ ADCS R27, c5 \
+ ADCS R29, c6 \
+ ADCS ZR, c7
+
+#define gfpReduce() \
+ \ // m = (T * N') mod R, store m in R1:R2:R3:R4
+ MOVD ·np+0(SB), R17 \
+ MOVD ·np+8(SB), R18 \
+ MOVD ·np+16(SB), R19 \
+ MOVD ·np+24(SB), R20 \
+ \
+ MUL R9, R17, R1 \
+ UMULH R9, R17, R2 \
+ MUL R9, R18, R0 \
+ ADDS R0, R2 \
+ UMULH R9, R18, R3 \
+ MUL R9, R19, R0 \
+ ADCS R0, R3 \
+ UMULH R9, R19, R4 \
+ MUL R9, R20, R0 \
+ ADCS R0, R4 \
+ \
+ MUL R10, R17, R21 \
+ UMULH R10, R17, R22 \
+ MUL R10, R18, R0 \
+ ADDS R0, R22 \
+ UMULH R10, R18, R23 \
+ MUL R10, R19, R0 \
+ ADCS R0, R23 \
+ ADDS R21, R2 \
+ ADCS R22, R3 \
+ ADCS R23, R4 \
+ \
+ MUL R11, R17, R21 \
+ UMULH R11, R17, R22 \
+ MUL R11, R18, R0 \
+ ADDS R0, R22 \
+ ADDS R21, R3 \
+ ADCS R22, R4 \
+ \
+ MUL R12, R17, R21 \
+ ADDS R21, R4 \
+ \
+ \ // m * N
+ loadModulus(R5,R6,R7,R8) \
+ mul(R17,R18,R19,R20,R21,R22,R23,R24) \
+ \
+ \ // Add the 512-bit intermediate to m*N
+ MOVD ZR, R25 \
+ ADDS R9, R17 \
+ ADCS R10, R18 \
+ ADCS R11, R19 \
+ ADCS R12, R20 \
+ ADCS R13, R21 \
+ ADCS R14, R22 \
+ ADCS R15, R23 \
+ ADCS R16, R24 \
+ ADCS ZR, R25 \
+ \
+ \ // Our output is R21:R22:R23:R24. Reduce mod p if necessary.
+ SUBS R5, R21, R10 \
+ SBCS R6, R22, R11 \
+ SBCS R7, R23, R12 \
+ SBCS R8, R24, R13 \
+ \
+ CSEL CS, R10, R21, R1 \
+ CSEL CS, R11, R22, R2 \
+ CSEL CS, R12, R23, R3 \
+ CSEL CS, R13, R24, R4
diff --git a/crypto/bn256/cloudflare/mul_bmi2.h b/crypto/bn256/cloudflare/mul_bmi2_amd64.h
index 71ad0499a..71ad0499a 100644
--- a/crypto/bn256/cloudflare/mul_bmi2.h
+++ b/crypto/bn256/cloudflare/mul_bmi2_amd64.h