20 files changed, 780 insertions, 150 deletions
diff --git a/crypto/bn256/bn256_other.go b/crypto/bn256/bn256_fast.go
index 81977a0a8..a8dfa8f67 100644
--- a/crypto/bn256/bn256_other.go
+++ b/crypto/bn256/bn256_fast.go
@@ -14,50 +14,22 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
 
-// +build !amd64 appengine gccgo
+// +build amd64 arm64
 
 // Package bn256 implements the Optimal Ate pairing over a 256-bit Barreto-Naehrig curve.
 package bn256
 
-import (
-	"math/big"
-
-	"github.com/ethereum/go-ethereum/crypto/bn256/google"
-)
+import "github.com/ethereum/go-ethereum/crypto/bn256/cloudflare"
 
 // G1 is an abstract cyclic group. The zero value is suitable for use as the
 // output of an operation, but cannot be used as an input.
-type G1 struct {
-	bn256.G1
-}
-
-// Add sets e to a+b and then returns e.
-func (e *G1) Add(a, b *G1) *G1 {
-	e.G1.Add(&a.G1, &b.G1)
-	return e
-}
-
-// ScalarMult sets e to a*k and then returns e.
-func (e *G1) ScalarMult(a *G1, k *big.Int) *G1 {
-	e.G1.ScalarMult(&a.G1, k)
-	return e
-}
+type G1 = bn256.G1
 
 // G2 is an abstract cyclic group. The zero value is suitable for use as the
 // output of an operation, but cannot be used as an input.
-type G2 struct {
-	bn256.G2
-}
+type G2 = bn256.G2
 
 // PairingCheck calculates the Optimal Ate pairing for a set of points.
 func PairingCheck(a []*G1, b []*G2) bool {
-	as := make([]*bn256.G1, len(a))
-	for i, p := range a {
-		as[i] = &p.G1
-	}
-	bs := make([]*bn256.G2, len(b))
-	for i, p := range b {
-		bs[i] = &p.G2
-	}
-	return bn256.PairingCheck(as, bs)
+	return bn256.PairingCheck(a, b)
 }
diff --git a/crypto/bn256/bn256_fuzz.go b/crypto/bn256/bn256_fuzz.go
new file mode 100644
index 000000000..f360b0541
--- /dev/null
+++ b/crypto/bn256/bn256_fuzz.go
@@ -0,0 +1,138 @@
+// Copyright 2018 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+// +build gofuzz
+
+package bn256
+
+import (
+	"bytes"
+	"math/big"
+
+	cloudflare "github.com/ethereum/go-ethereum/crypto/bn256/cloudflare"
+	google "github.com/ethereum/go-ethereum/crypto/bn256/google"
+)
+
+// FuzzAdd fuzzez bn256 addition between the Google and Cloudflare libraries.
+func FuzzAdd(data []byte) int {
+	// Ensure we have enough data in the first place
+	if len(data) != 128 {
+		return 0
+	}
+	// Ensure both libs can parse the first curve point
+	xc := new(cloudflare.G1)
+	_, errc := xc.Unmarshal(data[:64])
+
+	xg := new(google.G1)
+	_, errg := xg.Unmarshal(data[:64])
+
+	if (errc == nil) != (errg == nil) {
+		panic("parse mismatch")
+	} else if errc != nil {
+		return 0
+	}
+	// Ensure both libs can parse the second curve point
+	yc := new(cloudflare.G1)
+	_, errc = yc.Unmarshal(data[64:])
+
+	yg := new(google.G1)
+	_, errg = yg.Unmarshal(data[64:])
+
+	if (errc == nil) != (errg == nil) {
+		panic("parse mismatch")
+	} else if errc != nil {
+		return 0
+	}
+	// Add the two points and ensure they result in the same output
+	rc := new(cloudflare.G1)
+	rc.Add(xc, yc)
+
+	rg := new(google.G1)
+	rg.Add(xg, yg)
+
+	if !bytes.Equal(rc.Marshal(), rg.Marshal()) {
+		panic("add mismatch")
+	}
+	return 0
+}
+
+// FuzzMul fuzzez bn256 scalar multiplication between the Google and Cloudflare
+// libraries.
+func FuzzMul(data []byte) int {
+	// Ensure we have enough data in the first place
+	if len(data) != 96 {
+		return 0
+	}
+	// Ensure both libs can parse the curve point
+	pc := new(cloudflare.G1)
+	_, errc := pc.Unmarshal(data[:64])
+
+	pg := new(google.G1)
+	_, errg := pg.Unmarshal(data[:64])
+
+	if (errc == nil) != (errg == nil) {
+		panic("parse mismatch")
+	} else if errc != nil {
+		return 0
+	}
+	// Add the two points and ensure they result in the same output
+	rc := new(cloudflare.G1)
+	rc.ScalarMult(pc, new(big.Int).SetBytes(data[64:]))
+
+	rg := new(google.G1)
+	rg.ScalarMult(pg, new(big.Int).SetBytes(data[64:]))
+
+	if !bytes.Equal(rc.Marshal(), rg.Marshal()) {
+		panic("scalar mul mismatch")
+	}
+	return 0
+}
+
+func FuzzPair(data []byte) int {
+	// Ensure we have enough data in the first place
+	if len(data) != 192 {
+		return 0
+	}
+	// Ensure both libs can parse the curve point
+	pc := new(cloudflare.G1)
+	_, errc := pc.Unmarshal(data[:64])
+
+	pg := new(google.G1)
+	_, errg := pg.Unmarshal(data[:64])
+
+	if (errc == nil) != (errg == nil) {
+		panic("parse mismatch")
+	} else if errc != nil {
+		return 0
+	}
+	// Ensure both libs can parse the twist point
+	tc := new(cloudflare.G2)
+	_, errc = tc.Unmarshal(data[64:])
+
+	tg := new(google.G2)
+	_, errg = tg.Unmarshal(data[64:])
+
+	if (errc == nil) != (errg == nil) {
+		panic("parse mismatch")
+	} else if errc != nil {
+		return 0
+	}
+	// Pair the two points and ensure thet result in the same output
+	if cloudflare.PairingCheck([]*cloudflare.G1{pc}, []*cloudflare.G2{tc}) != google.PairingCheck([]*google.G1{pg}, []*google.G2{tg}) {
+		panic("pair mismatch")
+	}
+	return 0
+}
diff --git a/crypto/bn256/bn256_amd64.go b/crypto/bn256/bn256_slow.go
index 35b4839c2..61373763b 100644
--- a/crypto/bn256/bn256_amd64.go
+++ b/crypto/bn256/bn256_slow.go
@@ -14,50 +14,22 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
 
-// +build amd64,!appengine,!gccgo
+// +build !amd64,!arm64
 
 // Package bn256 implements the Optimal Ate pairing over a 256-bit Barreto-Naehrig curve.
 package bn256
 
-import (
-	"math/big"
-
-	"github.com/ethereum/go-ethereum/crypto/bn256/cloudflare"
-)
+import "github.com/ethereum/go-ethereum/crypto/bn256/google"
 
 // G1 is an abstract cyclic group. The zero value is suitable for use as the
 // output of an operation, but cannot be used as an input.
-type G1 struct {
-	bn256.G1
-}
-
-// Add sets e to a+b and then returns e.
-func (e *G1) Add(a, b *G1) *G1 {
-	e.G1.Add(&a.G1, &b.G1)
-	return e
-}
-
-// ScalarMult sets e to a*k and then returns e.
-func (e *G1) ScalarMult(a *G1, k *big.Int) *G1 {
-	e.G1.ScalarMult(&a.G1, k)
-	return e
-}
+type G1 = bn256.G1
 
 // G2 is an abstract cyclic group. The zero value is suitable for use as the
 // output of an operation, but cannot be used as an input.
-type G2 struct {
-	bn256.G2
-}
+type G2 = bn256.G2
 
 // PairingCheck calculates the Optimal Ate pairing for a set of points.
 func PairingCheck(a []*G1, b []*G2) bool {
-	as := make([]*bn256.G1, len(a))
-	for i, p := range a {
-		as[i] = &p.G1
-	}
-	bs := make([]*bn256.G2, len(b))
-	for i, p := range b {
-		bs[i] = &p.G2
-	}
-	return bn256.PairingCheck(as, bs)
+	return bn256.PairingCheck(a, b)
 }
diff --git a/crypto/bn256/cloudflare/bn256_test.go b/crypto/bn256/cloudflare/bn256_test.go
index 369a3edaa..0c8016d86 100644
--- a/crypto/bn256/cloudflare/bn256_test.go
+++ b/crypto/bn256/cloudflare/bn256_test.go
@@ -1,5 +1,3 @@
-// +build amd64,!appengine,!gccgo
-
 package bn256
 
 import (
diff --git a/crypto/bn256/cloudflare/curve.go b/crypto/bn256/cloudflare/curve.go
index b6aecc0a6..18e9b38f3 100644
--- a/crypto/bn256/cloudflare/curve.go
+++ b/crypto/bn256/cloudflare/curve.go
@@ -183,15 +183,24 @@ func (c *curvePoint) Double(a *curvePoint) {
 }
 
 func (c *curvePoint) Mul(a *curvePoint, scalar *big.Int) {
-	sum, t := &curvePoint{}, &curvePoint{}
+	precomp := [1 << 2]*curvePoint{nil, {}, {}, {}}
+	precomp[1].Set(a)
+	precomp[2].Set(a)
+	gfpMul(&precomp[2].x, &precomp[2].x, xiTo2PSquaredMinus2Over3)
+	precomp[3].Add(precomp[1], precomp[2])
+
+	multiScalar := curveLattice.Multi(scalar)
+
+	sum := &curvePoint{}
 	sum.SetInfinity()
+	t := &curvePoint{}
 
-	for i := scalar.BitLen(); i >= 0; i-- {
+	for i := len(multiScalar) - 1; i >= 0; i-- {
 		t.Double(sum)
-		if scalar.Bit(i) != 0 {
-			sum.Add(t, a)
-		} else {
+		if multiScalar[i] == 0 {
 			sum.Set(t)
+		} else {
+			sum.Add(t, precomp[multiScalar[i]])
 		}
 	}
 	c.Set(sum)
diff --git a/crypto/bn256/cloudflare/example_test.go b/crypto/bn256/cloudflare/example_test.go
index 2ee545c67..b2d19807a 100644
--- a/crypto/bn256/cloudflare/example_test.go
+++ b/crypto/bn256/cloudflare/example_test.go
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64,!appengine,!gccgo
-
 package bn256
 
 import (
diff --git a/crypto/bn256/cloudflare/gfp.h b/crypto/bn256/cloudflare/gfp.h
deleted file mode 100644
index 66f5a4d07..000000000
--- a/crypto/bn256/cloudflare/gfp.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#define storeBlock(a0,a1,a2,a3, r) \
-	MOVQ a0,  0+r \
-	MOVQ a1,  8+r \
-	MOVQ a2, 16+r \
-	MOVQ a3, 24+r
-
-#define loadBlock(r, a0,a1,a2,a3) \
-	MOVQ  0+r, a0 \
-	MOVQ  8+r, a1 \
-	MOVQ 16+r, a2 \
-	MOVQ 24+r, a3
-
-#define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \
-	\ // b = a-p
-	MOVQ a0, b0 \
-	MOVQ a1, b1 \
-	MOVQ a2, b2 \
-	MOVQ a3, b3 \
-	MOVQ a4, b4 \
-	\
-	SUBQ ·p2+0(SB), b0 \
-	SBBQ ·p2+8(SB), b1 \
-	SBBQ ·p2+16(SB), b2 \
-	SBBQ ·p2+24(SB), b3 \
-	SBBQ $0, b4 \
-	\
-	\ // if b is negative then return a
-	\ // else return b
-	CMOVQCC b0, a0 \
-	CMOVQCC b1, a1 \
-	CMOVQCC b2, a2 \
-	CMOVQCC b3, a3
diff --git a/crypto/bn256/cloudflare/gfp_amd64.go b/crypto/bn256/cloudflare/gfp_amd64.go
deleted file mode 100644
index ac4f1a9c6..000000000
--- a/crypto/bn256/cloudflare/gfp_amd64.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// +build amd64,!appengine,!gccgo
-
-package bn256
-
-// go:noescape
-func gfpNeg(c, a *gfP)
-
-//go:noescape
-func gfpAdd(c, a, b *gfP)
-
-//go:noescape
-func gfpSub(c, a, b *gfP)
-
-//go:noescape
-func gfpMul(c, a, b *gfP)
diff --git a/crypto/bn256/cloudflare/gfp_amd64.s b/crypto/bn256/cloudflare/gfp_amd64.s
index 2d0176f2e..3a785d200 100644
--- a/crypto/bn256/cloudflare/gfp_amd64.s
+++ b/crypto/bn256/cloudflare/gfp_amd64.s
@@ -1,8 +1,40 @@
-// +build amd64,!appengine,!gccgo
-
-#include "gfp.h"
-#include "mul.h"
-#include "mul_bmi2.h"
+// +build amd64,!generic
+
+#define storeBlock(a0,a1,a2,a3, r) \
+	MOVQ a0,  0+r \
+	MOVQ a1,  8+r \
+	MOVQ a2, 16+r \
+	MOVQ a3, 24+r
+
+#define loadBlock(r, a0,a1,a2,a3) \
+	MOVQ  0+r, a0 \
+	MOVQ  8+r, a1 \
+	MOVQ 16+r, a2 \
+	MOVQ 24+r, a3
+
+#define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \
+	\ // b = a-p
+	MOVQ a0, b0 \
+	MOVQ a1, b1 \
+	MOVQ a2, b2 \
+	MOVQ a3, b3 \
+	MOVQ a4, b4 \
+	\
+	SUBQ ·p2+0(SB), b0 \
+	SBBQ ·p2+8(SB), b1 \
+	SBBQ ·p2+16(SB), b2 \
+	SBBQ ·p2+24(SB), b3 \
+	SBBQ $0, b4 \
+	\
+	\ // if b is negative then return a
+	\ // else return b
+	CMOVQCC b0, a0 \
+	CMOVQCC b1, a1 \
+	CMOVQCC b2, a2 \
+	CMOVQCC b3, a3
+
+#include "mul_amd64.h"
+#include "mul_bmi2_amd64.h"
 
 TEXT ·gfpNeg(SB),0,$0-16
 	MOVQ ·p2+0(SB), R8
diff --git a/crypto/bn256/cloudflare/gfp_arm64.s b/crypto/bn256/cloudflare/gfp_arm64.s
new file mode 100644
index 000000000..c65e80168
--- /dev/null
+++ b/crypto/bn256/cloudflare/gfp_arm64.s
@@ -0,0 +1,113 @@
+// +build arm64,!generic
+
+#define storeBlock(a0,a1,a2,a3, r) \
+	MOVD a0,  0+r \
+	MOVD a1,  8+r \
+	MOVD a2, 16+r \
+	MOVD a3, 24+r
+
+#define loadBlock(r, a0,a1,a2,a3) \
+	MOVD  0+r, a0 \
+	MOVD  8+r, a1 \
+	MOVD 16+r, a2 \
+	MOVD 24+r, a3
+
+#define loadModulus(p0,p1,p2,p3) \
+	MOVD ·p2+0(SB), p0 \
+	MOVD ·p2+8(SB), p1 \
+	MOVD ·p2+16(SB), p2 \
+	MOVD ·p2+24(SB), p3
+
+#include "mul_arm64.h"
+
+TEXT ·gfpNeg(SB),0,$0-16
+	MOVD a+8(FP), R0
+	loadBlock(0(R0), R1,R2,R3,R4)
+	loadModulus(R5,R6,R7,R8)
+
+	SUBS R1, R5, R1
+	SBCS R2, R6, R2
+	SBCS R3, R7, R3
+	SBCS R4, R8, R4
+
+	SUBS R5, R1, R5
+	SBCS R6, R2, R6
+	SBCS R7, R3, R7
+	SBCS R8, R4, R8
+
+	CSEL CS, R5, R1, R1
+	CSEL CS, R6, R2, R2
+	CSEL CS, R7, R3, R3
+	CSEL CS, R8, R4, R4
+
+	MOVD c+0(FP), R0
+	storeBlock(R1,R2,R3,R4, 0(R0))
+	RET
+
+TEXT ·gfpAdd(SB),0,$0-24
+	MOVD a+8(FP), R0
+	loadBlock(0(R0), R1,R2,R3,R4)
+	MOVD b+16(FP), R0
+	loadBlock(0(R0), R5,R6,R7,R8)
+	loadModulus(R9,R10,R11,R12)
+	MOVD ZR, R0
+
+	ADDS R5, R1
+	ADCS R6, R2
+	ADCS R7, R3
+	ADCS R8, R4
+	ADCS ZR, R0
+
+	SUBS  R9, R1, R5
+	SBCS R10, R2, R6
+	SBCS R11, R3, R7
+	SBCS R12, R4, R8
+	SBCS  ZR, R0, R0
+
+	CSEL CS, R5, R1, R1
+	CSEL CS, R6, R2, R2
+	CSEL CS, R7, R3, R3
+	CSEL CS, R8, R4, R4
+
+	MOVD c+0(FP), R0
+	storeBlock(R1,R2,R3,R4, 0(R0))
+	RET
+
+TEXT ·gfpSub(SB),0,$0-24
+	MOVD a+8(FP), R0
+	loadBlock(0(R0), R1,R2,R3,R4)
+	MOVD b+16(FP), R0
+	loadBlock(0(R0), R5,R6,R7,R8)
+	loadModulus(R9,R10,R11,R12)
+
+	SUBS R5, R1
+	SBCS R6, R2
+	SBCS R7, R3
+	SBCS R8, R4
+
+	CSEL CS, ZR,  R9,  R9
+	CSEL CS, ZR, R10, R10
+	CSEL CS, ZR, R11, R11
+	CSEL CS, ZR, R12, R12
+
+	ADDS  R9, R1
+	ADCS R10, R2
+	ADCS R11, R3
+	ADCS R12, R4
+
+	MOVD c+0(FP), R0
+	storeBlock(R1,R2,R3,R4, 0(R0))
+	RET
+
+TEXT ·gfpMul(SB),0,$0-24
+	MOVD a+8(FP), R0
+	loadBlock(0(R0), R1,R2,R3,R4)
+	MOVD b+16(FP), R0
+	loadBlock(0(R0), R5,R6,R7,R8)
+
+	mul(R9,R10,R11,R12,R13,R14,R15,R16)
+	gfpReduce()
+
+	MOVD c+0(FP), R0
+	storeBlock(R1,R2,R3,R4, 0(R0))
+	RET
diff --git a/crypto/bn256/cloudflare/gfp_decl.go b/crypto/bn256/cloudflare/gfp_decl.go
new file mode 100644
index 000000000..6a8a4fddb
--- /dev/null
+++ b/crypto/bn256/cloudflare/gfp_decl.go
@@ -0,0 +1,18 @@
+// +build amd64,!generic arm64,!generic
+
+package bn256
+
+// This file contains forward declarations for the architecture-specific
+// assembly implementations of these functions, provided that they exist.
+
+// go:noescape
+func gfpNeg(c, a *gfP)
+
+//go:noescape
+func gfpAdd(c, a, b *gfP)
+
+//go:noescape
+func gfpSub(c, a, b *gfP)
+
+//go:noescape
+func gfpMul(c, a, b *gfP)
diff --git a/crypto/bn256/cloudflare/gfp_generic.go b/crypto/bn256/cloudflare/gfp_generic.go
new file mode 100644
index 000000000..8e6be9596
--- /dev/null
+++ b/crypto/bn256/cloudflare/gfp_generic.go
@@ -0,0 +1,173 @@
+// +build !amd64,!arm64 generic
+
+package bn256
+
+func gfpCarry(a *gfP, head uint64) {
+	b := &gfP{}
+
+	var carry uint64
+	for i, pi := range p2 {
+		ai := a[i]
+		bi := ai - pi - carry
+		b[i] = bi
+		carry = (pi&^ai | (pi|^ai)&bi) >> 63
+	}
+	carry = carry &^ head
+
+	// If b is negative, then return a.
+	// Else return b.
+	carry = -carry
+	ncarry := ^carry
+	for i := 0; i < 4; i++ {
+		a[i] = (a[i] & carry) | (b[i] & ncarry)
+	}
+}
+
+func gfpNeg(c, a *gfP) {
+	var carry uint64
+	for i, pi := range p2 {
+		ai := a[i]
+		ci := pi - ai - carry
+		c[i] = ci
+		carry = (ai&^pi | (ai|^pi)&ci) >> 63
+	}
+	gfpCarry(c, 0)
+}
+
+func gfpAdd(c, a, b *gfP) {
+	var carry uint64
+	for i, ai := range a {
+		bi := b[i]
+		ci := ai + bi + carry
+		c[i] = ci
+		carry = (ai&bi | (ai|bi)&^ci) >> 63
+	}
+	gfpCarry(c, carry)
+}
+
+func gfpSub(c, a, b *gfP) {
+	t := &gfP{}
+
+	var carry uint64
+	for i, pi := range p2 {
+		bi := b[i]
+		ti := pi - bi - carry
+		t[i] = ti
+		carry = (bi&^pi | (bi|^pi)&ti) >> 63
+	}
+
+	carry = 0
+	for i, ai := range a {
+		ti := t[i]
+		ci := ai + ti + carry
+		c[i] = ci
+		carry = (ai&ti | (ai|ti)&^ci) >> 63
+	}
+	gfpCarry(c, carry)
+}
+
+func mul(a, b [4]uint64) [8]uint64 {
+	const (
+		mask16 uint64 = 0x0000ffff
+		mask32 uint64 = 0xffffffff
+	)
+
+	var buff [32]uint64
+	for i, ai := range a {
+		a0, a1, a2, a3 := ai&mask16, (ai>>16)&mask16, (ai>>32)&mask16, ai>>48
+
+		for j, bj := range b {
+			b0, b2 := bj&mask32, bj>>32
+
+			off := 4 * (i + j)
+			buff[off+0] += a0 * b0
+			buff[off+1] += a1 * b0
+			buff[off+2] += a2*b0 + a0*b2
+			buff[off+3] += a3*b0 + a1*b2
+			buff[off+4] += a2 * b2
+			buff[off+5] += a3 * b2
+		}
+	}
+
+	for i := uint(1); i < 4; i++ {
+		shift := 16 * i
+
+		var head, carry uint64
+		for j := uint(0); j < 8; j++ {
+			block := 4 * j
+
+			xi := buff[block]
+			yi := (buff[block+i] << shift) + head
+			zi := xi + yi + carry
+			buff[block] = zi
+			carry = (xi&yi | (xi|yi)&^zi) >> 63
+
+			head = buff[block+i] >> (64 - shift)
+		}
+	}
+
+	return [8]uint64{buff[0], buff[4], buff[8], buff[12], buff[16], buff[20], buff[24], buff[28]}
+}
+
+func halfMul(a, b [4]uint64) [4]uint64 {
+	const (
+		mask16 uint64 = 0x0000ffff
+		mask32 uint64 = 0xffffffff
+	)
+
+	var buff [18]uint64
+	for i, ai := range a {
+		a0, a1, a2, a3 := ai&mask16, (ai>>16)&mask16, (ai>>32)&mask16, ai>>48
+
+		for j, bj := range b {
+			if i+j > 3 {
+				break
+			}
+			b0, b2 := bj&mask32, bj>>32
+
+			off := 4 * (i + j)
+			buff[off+0] += a0 * b0
+			buff[off+1] += a1 * b0
+			buff[off+2] += a2*b0 + a0*b2
+			buff[off+3] += a3*b0 + a1*b2
+			buff[off+4] += a2 * b2
+			buff[off+5] += a3 * b2
+		}
+	}
+
+	for i := uint(1); i < 4; i++ {
+		shift := 16 * i
+
+		var head, carry uint64
+		for j := uint(0); j < 4; j++ {
+			block := 4 * j
+
+			xi := buff[block]
+			yi := (buff[block+i] << shift) + head
+			zi := xi + yi + carry
+			buff[block] = zi
+			carry = (xi&yi | (xi|yi)&^zi) >> 63
+
+			head = buff[block+i] >> (64 - shift)
+		}
+	}
+
+	return [4]uint64{buff[0], buff[4], buff[8], buff[12]}
+}
+
+func gfpMul(c, a, b *gfP) {
+	T := mul(*a, *b)
+	m := halfMul([4]uint64{T[0], T[1], T[2], T[3]}, np)
+	t := mul([4]uint64{m[0], m[1], m[2], m[3]}, p2)
+
+	var carry uint64
+	for i, Ti := range T {
+		ti := t[i]
+		zi := Ti + ti + carry
+		T[i] = zi
+		carry = (Ti&ti | (Ti|ti)&^zi) >> 63
+	}
+
+	*c = gfP{T[4], T[5], T[6], T[7]}
+	gfpCarry(c, carry)
+}
diff --git a/crypto/bn256/cloudflare/gfp_pure.go b/crypto/bn256/cloudflare/gfp_pure.go
deleted file mode 100644
index 8fa5d3053..000000000
--- a/crypto/bn256/cloudflare/gfp_pure.go
+++ /dev/null
@@ -1,19 +0,0 @@
-//  +build !amd64 appengine gccgo
-
-package bn256
-
-func gfpNeg(c, a *gfP) {
-	panic("unsupported architecture")
-}
-
-func gfpAdd(c, a, b *gfP) {
-	panic("unsupported architecture")
-}
-
-func gfpSub(c, a, b *gfP) {
-	panic("unsupported architecture")
-}
-
-func gfpMul(c, a, b *gfP) {
-	panic("unsupported architecture")
-}
diff --git a/crypto/bn256/cloudflare/gfp_test.go b/crypto/bn256/cloudflare/gfp_test.go
index aff5e0531..16ab2a841 100644
--- a/crypto/bn256/cloudflare/gfp_test.go
+++ b/crypto/bn256/cloudflare/gfp_test.go
@@ -1,5 +1,3 @@
-// +build amd64,!appengine,!gccgo
-
 package bn256
 
 import (
diff --git a/crypto/bn256/cloudflare/lattice.go b/crypto/bn256/cloudflare/lattice.go
new file mode 100644
index 000000000..f9ace4d9f
--- /dev/null
+++ b/crypto/bn256/cloudflare/lattice.go
@@ -0,0 +1,115 @@
+package bn256
+
+import (
+	"math/big"
+)
+
+var half = new(big.Int).Rsh(Order, 1)
+
+var curveLattice = &lattice{
+	vectors: [][]*big.Int{
+		{bigFromBase10("147946756881789319000765030803803410728"), bigFromBase10("147946756881789319010696353538189108491")},
+		{bigFromBase10("147946756881789319020627676272574806254"), bigFromBase10("-147946756881789318990833708069417712965")},
+	},
+	inverse: []*big.Int{
+		bigFromBase10("147946756881789318990833708069417712965"),
+		bigFromBase10("147946756881789319010696353538189108491"),
+	},
+	det: bigFromBase10("43776485743678550444492811490514550177096728800832068687396408373151616991234"),
+}
+
+var targetLattice = &lattice{
+	vectors: [][]*big.Int{
+		{bigFromBase10("9931322734385697761"), bigFromBase10("9931322734385697761"), bigFromBase10("9931322734385697763"), bigFromBase10("9931322734385697764")},
+		{bigFromBase10("4965661367192848881"), bigFromBase10("4965661367192848881"), bigFromBase10("4965661367192848882"), bigFromBase10("-9931322734385697762")},
+		{bigFromBase10("-9931322734385697762"), bigFromBase10("-4965661367192848881"), bigFromBase10("4965661367192848881"), bigFromBase10("-4965661367192848882")},
+		{bigFromBase10("9931322734385697763"), bigFromBase10("-4965661367192848881"), bigFromBase10("-4965661367192848881"), bigFromBase10("-4965661367192848881")},
+	},
+	inverse: []*big.Int{
+		bigFromBase10("734653495049373973658254490726798021314063399421879442165"),
+		bigFromBase10("147946756881789319000765030803803410728"),
+		bigFromBase10("-147946756881789319005730692170996259609"),
+		bigFromBase10("1469306990098747947464455738335385361643788813749140841702"),
+	},
+	det: new(big.Int).Set(Order),
+}
+
+type lattice struct {
+	vectors [][]*big.Int
+	inverse []*big.Int
+	det     *big.Int
+}
+
+// decompose takes a scalar mod Order as input and finds a short, positive decomposition of it wrt to the lattice basis.
+func (l *lattice) decompose(k *big.Int) []*big.Int {
+	n := len(l.inverse)
+
+	// Calculate closest vector in lattice to <k,0,0,...> with Babai's rounding.
+	c := make([]*big.Int, n)
+	for i := 0; i < n; i++ {
+		c[i] = new(big.Int).Mul(k, l.inverse[i])
+		round(c[i], l.det)
+	}
+
+	// Transform vectors according to c and subtract <k,0,0,...>.
+	out := make([]*big.Int, n)
+	temp := new(big.Int)
+
+	for i := 0; i < n; i++ {
+		out[i] = new(big.Int)
+
+		for j := 0; j < n; j++ {
+			temp.Mul(c[j], l.vectors[j][i])
+			out[i].Add(out[i], temp)
+		}
+
+		out[i].Neg(out[i])
+		out[i].Add(out[i], l.vectors[0][i]).Add(out[i], l.vectors[0][i])
+	}
+	out[0].Add(out[0], k)
+
+	return out
+}
+
+func (l *lattice) Precompute(add func(i, j uint)) {
+	n := uint(len(l.vectors))
+	total := uint(1) << n
+
+	for i := uint(0); i < n; i++ {
+		for j := uint(0); j < total; j++ {
+			if (j>>i)&1 == 1 {
+				add(i, j)
+			}
+		}
+	}
+}
+
+func (l *lattice) Multi(scalar *big.Int) []uint8 {
+	decomp := l.decompose(scalar)
+
+	maxLen := 0
+	for _, x := range decomp {
+		if x.BitLen() > maxLen {
+			maxLen = x.BitLen()
+		}
+	}
+
+	out := make([]uint8, maxLen)
+	for j, x := range decomp {
+		for i := 0; i < maxLen; i++ {
+			out[i] += uint8(x.Bit(i)) << uint(j)
+		}
+	}
+
+	return out
+}
+
+// round sets num to num/denom rounded to the nearest integer.
+func round(num, denom *big.Int) {
+	r := new(big.Int)
+	num.DivMod(num, denom, r)
+
+	if r.Cmp(half) == 1 {
+		num.Add(num, big.NewInt(1))
+	}
+}
diff --git a/crypto/bn256/cloudflare/lattice_test.go b/crypto/bn256/cloudflare/lattice_test.go
new file mode 100644
index 000000000..4d52ad9b2
--- /dev/null
+++ b/crypto/bn256/cloudflare/lattice_test.go
@@ -0,0 +1,29 @@
+package bn256
+
+import (
+	"crypto/rand"
+
+	"testing"
+)
+
+func TestLatticeReduceCurve(t *testing.T) {
+	k, _ := rand.Int(rand.Reader, Order)
+	ks := curveLattice.decompose(k)
+
+	if ks[0].BitLen() > 130 || ks[1].BitLen() > 130 {
+		t.Fatal("reduction too large")
+	} else if ks[0].Sign() < 0 || ks[1].Sign() < 0 {
+		t.Fatal("reduction must be positive")
+	}
+}
+
+func TestLatticeReduceTarget(t *testing.T) {
+	k, _ := rand.Int(rand.Reader, Order)
+	ks := targetLattice.decompose(k)
+
+	if ks[0].BitLen() > 66 || ks[1].BitLen() > 66 || ks[2].BitLen() > 66 || ks[3].BitLen() > 66 {
+		t.Fatal("reduction too large")
+	} else if ks[0].Sign() < 0 || ks[1].Sign() < 0 || ks[2].Sign() < 0 || ks[3].Sign() < 0 {
+		t.Fatal("reduction must be positive")
+	}
+}
diff --git a/crypto/bn256/cloudflare/main_test.go b/crypto/bn256/cloudflare/main_test.go
index f0d59a404..0230f1b19 100644
--- a/crypto/bn256/cloudflare/main_test.go
+++ b/crypto/bn256/cloudflare/main_test.go
@@ -1,5 +1,3 @@
-// +build amd64,!appengine,!gccgo
-
 package bn256
 
 import (
diff --git a/crypto/bn256/cloudflare/mul.h b/crypto/bn256/cloudflare/mul_amd64.h
index bab5da831..bab5da831 100644
--- a/crypto/bn256/cloudflare/mul.h
+++ b/crypto/bn256/cloudflare/mul_amd64.h
diff --git a/crypto/bn256/cloudflare/mul_arm64.h b/crypto/bn256/cloudflare/mul_arm64.h
new file mode 100644
index 000000000..75d522173
--- /dev/null
+++ b/crypto/bn256/cloudflare/mul_arm64.h
@@ -0,0 +1,133 @@
+#define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
+	MUL R1, R5, c0 \
+	UMULH R1, R5, c1 \
+	MUL R1, R6, R0 \
+	ADDS R0, c1 \
+	UMULH R1, R6, c2 \
+	MUL R1, R7, R0 \
+	ADCS R0, c2 \
+	UMULH R1, R7, c3 \
+	MUL R1, R8, R0 \
+	ADCS R0, c3 \
+	UMULH R1, R8, c4 \
+	ADCS ZR, c4 \
+	\
+	MUL R2, R5, R25 \
+	UMULH R2, R5, R26 \
+	MUL R2, R6, R0 \
+	ADDS R0, R26 \
+	UMULH R2, R6, R27 \
+	MUL R2, R7, R0 \
+	ADCS R0, R27 \
+	UMULH R2, R7, R29 \
+	MUL R2, R8, R0 \
+	ADCS R0, R29 \
+	UMULH R2, R8, c5 \
+	ADCS ZR, c5 \
+	ADDS R25, c1 \
+	ADCS R26, c2 \
+	ADCS R27, c3 \
+	ADCS R29, c4 \
+	ADCS  ZR, c5 \
+	\
+	MUL R3, R5, R25 \
+	UMULH R3, R5, R26 \
+	MUL R3, R6, R0 \
+	ADDS R0, R26 \
+	UMULH R3, R6, R27 \
+	MUL R3, R7, R0 \
+	ADCS R0, R27 \
+	UMULH R3, R7, R29 \
+	MUL R3, R8, R0 \
+	ADCS R0, R29 \
+	UMULH R3, R8, c6 \
+	ADCS ZR, c6 \
+	ADDS R25, c2 \
+	ADCS R26, c3 \
+	ADCS R27, c4 \
+	ADCS R29, c5 \
+	ADCS  ZR, c6 \
+	\
+	MUL R4, R5, R25 \
+	UMULH R4, R5, R26 \
+	MUL R4, R6, R0 \
+	ADDS R0, R26 \
+	UMULH R4, R6, R27 \
+	MUL R4, R7, R0 \
+	ADCS R0, R27 \
+	UMULH R4, R7, R29 \
+	MUL R4, R8, R0 \
+	ADCS R0, R29 \
+	UMULH R4, R8, c7 \
+	ADCS ZR, c7 \
+	ADDS R25, c3 \
+	ADCS R26, c4 \
+	ADCS R27, c5 \
+	ADCS R29, c6 \
+	ADCS  ZR, c7
+
+#define gfpReduce() \
+	\ // m = (T * N') mod R, store m in R1:R2:R3:R4
+	MOVD ·np+0(SB), R17 \
+	MOVD ·np+8(SB), R18 \
+	MOVD ·np+16(SB), R19 \
+	MOVD ·np+24(SB), R20 \
+	\
+	MUL R9, R17, R1 \
+	UMULH R9, R17, R2 \
+	MUL R9, R18, R0 \
+	ADDS R0, R2 \
+	UMULH R9, R18, R3 \
+	MUL R9, R19, R0 \
+	ADCS R0, R3 \
+	UMULH R9, R19, R4 \
+	MUL R9, R20, R0 \
+	ADCS R0, R4 \
+	\
+	MUL R10, R17, R21 \
+	UMULH R10, R17, R22 \
+	MUL R10, R18, R0 \
+	ADDS R0, R22 \
+	UMULH R10, R18, R23 \
+	MUL R10, R19, R0 \
+	ADCS R0, R23 \
+	ADDS R21, R2 \
+	ADCS R22, R3 \
+	ADCS R23, R4 \
+	\
+	MUL R11, R17, R21 \
+	UMULH R11, R17, R22 \
+	MUL R11, R18, R0 \
+	ADDS R0, R22 \
+	ADDS R21, R3 \
+	ADCS R22, R4 \
+	\
+	MUL R12, R17, R21 \
+	ADDS R21, R4 \
+	\
+	\ // m * N
+	loadModulus(R5,R6,R7,R8) \
+	mul(R17,R18,R19,R20,R21,R22,R23,R24) \
+	\
+	\ // Add the 512-bit intermediate to m*N
+	MOVD  ZR, R25 \
+	ADDS  R9, R17 \
+	ADCS R10, R18 \
+	ADCS R11, R19 \
+	ADCS R12, R20 \
+	ADCS R13, R21 \
+	ADCS R14, R22 \
+	ADCS R15, R23 \
+	ADCS R16, R24 \
+	ADCS  ZR, R25 \
+	\
+	\ // Our output is R21:R22:R23:R24. Reduce mod p if necessary.
+	SUBS R5, R21, R10 \
+	SBCS R6, R22, R11 \
+	SBCS R7, R23, R12 \
+	SBCS R8, R24, R13 \
+	\
+	CSEL CS, R10, R21, R1 \
+	CSEL CS, R11, R22, R2 \
+	CSEL CS, R12, R23, R3 \
+	CSEL CS, R13, R24, R4
diff --git a/crypto/bn256/cloudflare/mul_bmi2.h b/crypto/bn256/cloudflare/mul_bmi2_amd64.h
index 71ad0499a..71ad0499a 100644
--- a/crypto/bn256/cloudflare/mul_bmi2.h
+++ b/crypto/bn256/cloudflare/mul_bmi2_amd64.h