From 1203c6a237cb87b78ec495772cecb178200499ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Tue, 20 Mar 2018 01:13:54 +0900 Subject: crypto/bn256: full switchover to cloudflare's code (#16301) * crypto/bn256: full switchover to cloudflare's code * crypto/bn256: only use cloudflare for optimized architectures * crypto/bn256: upstream fallback for non-optimized code * .travis, build: drop support for Go 1.8 (need type aliases) * crypto/bn256/cloudflare: enable curve mul lattice optimization --- crypto/bn256/bn256_amd64.go | 63 ----------- crypto/bn256/bn256_fast.go | 35 ++++++ crypto/bn256/bn256_fuzz.go | 138 +++++++++++++++++++++++ crypto/bn256/bn256_other.go | 63 ----------- crypto/bn256/bn256_slow.go | 35 ++++++ crypto/bn256/cloudflare/bn256_test.go | 2 - crypto/bn256/cloudflare/curve.go | 19 +++- crypto/bn256/cloudflare/example_test.go | 2 - crypto/bn256/cloudflare/gfp.h | 32 ------ crypto/bn256/cloudflare/gfp_amd64.go | 15 --- crypto/bn256/cloudflare/gfp_amd64.s | 42 ++++++- crypto/bn256/cloudflare/gfp_arm64.s | 113 +++++++++++++++++++ crypto/bn256/cloudflare/gfp_decl.go | 18 +++ crypto/bn256/cloudflare/gfp_generic.go | 173 +++++++++++++++++++++++++++++ crypto/bn256/cloudflare/gfp_pure.go | 19 ---- crypto/bn256/cloudflare/gfp_test.go | 2 - crypto/bn256/cloudflare/lattice.go | 115 ++++++++++++++++++++ crypto/bn256/cloudflare/lattice_test.go | 29 +++++ crypto/bn256/cloudflare/main_test.go | 2 - crypto/bn256/cloudflare/mul.h | 181 ------------------------------- crypto/bn256/cloudflare/mul_amd64.h | 181 +++++++++++++++++++++++++++++++ crypto/bn256/cloudflare/mul_arm64.h | 133 +++++++++++++++++++++++ crypto/bn256/cloudflare/mul_bmi2.h | 112 ------------------- crypto/bn256/cloudflare/mul_bmi2_amd64.h | 112 +++++++++++++++++++ 24 files changed, 1133 insertions(+), 503 deletions(-) delete mode 100644 crypto/bn256/bn256_amd64.go create mode 100644 crypto/bn256/bn256_fast.go create mode 100644 crypto/bn256/bn256_fuzz.go delete mode 100644 crypto/bn256/bn256_other.go create mode 100644 crypto/bn256/bn256_slow.go delete mode 100644 crypto/bn256/cloudflare/gfp.h delete mode 100644 crypto/bn256/cloudflare/gfp_amd64.go create mode 100644 crypto/bn256/cloudflare/gfp_arm64.s create mode 100644 crypto/bn256/cloudflare/gfp_decl.go create mode 100644 crypto/bn256/cloudflare/gfp_generic.go delete mode 100644 crypto/bn256/cloudflare/gfp_pure.go create mode 100644 crypto/bn256/cloudflare/lattice.go create mode 100644 crypto/bn256/cloudflare/lattice_test.go delete mode 100644 crypto/bn256/cloudflare/mul.h create mode 100644 crypto/bn256/cloudflare/mul_amd64.h create mode 100644 crypto/bn256/cloudflare/mul_arm64.h delete mode 100644 crypto/bn256/cloudflare/mul_bmi2.h create mode 100644 crypto/bn256/cloudflare/mul_bmi2_amd64.h (limited to 'crypto') diff --git a/crypto/bn256/bn256_amd64.go b/crypto/bn256/bn256_amd64.go deleted file mode 100644 index 35b4839c2..000000000 --- a/crypto/bn256/bn256_amd64.go +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2018 The go-ethereum Authors -// This file is part of the go-ethereum library. -// -// The go-ethereum library is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// The go-ethereum library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with the go-ethereum library. If not, see . - -// +build amd64,!appengine,!gccgo - -// Package bn256 implements the Optimal Ate pairing over a 256-bit Barreto-Naehrig curve. -package bn256 - -import ( - "math/big" - - "github.com/ethereum/go-ethereum/crypto/bn256/cloudflare" -) - -// G1 is an abstract cyclic group. The zero value is suitable for use as the -// output of an operation, but cannot be used as an input. -type G1 struct { - bn256.G1 -} - -// Add sets e to a+b and then returns e. -func (e *G1) Add(a, b *G1) *G1 { - e.G1.Add(&a.G1, &b.G1) - return e -} - -// ScalarMult sets e to a*k and then returns e. -func (e *G1) ScalarMult(a *G1, k *big.Int) *G1 { - e.G1.ScalarMult(&a.G1, k) - return e -} - -// G2 is an abstract cyclic group. The zero value is suitable for use as the -// output of an operation, but cannot be used as an input. -type G2 struct { - bn256.G2 -} - -// PairingCheck calculates the Optimal Ate pairing for a set of points. -func PairingCheck(a []*G1, b []*G2) bool { - as := make([]*bn256.G1, len(a)) - for i, p := range a { - as[i] = &p.G1 - } - bs := make([]*bn256.G2, len(b)) - for i, p := range b { - bs[i] = &p.G2 - } - return bn256.PairingCheck(as, bs) -} diff --git a/crypto/bn256/bn256_fast.go b/crypto/bn256/bn256_fast.go new file mode 100644 index 000000000..a8dfa8f67 --- /dev/null +++ b/crypto/bn256/bn256_fast.go @@ -0,0 +1,35 @@ +// Copyright 2018 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +// +build amd64 arm64 + +// Package bn256 implements the Optimal Ate pairing over a 256-bit Barreto-Naehrig curve. +package bn256 + +import "github.com/ethereum/go-ethereum/crypto/bn256/cloudflare" + +// G1 is an abstract cyclic group. The zero value is suitable for use as the +// output of an operation, but cannot be used as an input. +type G1 = bn256.G1 + +// G2 is an abstract cyclic group. The zero value is suitable for use as the +// output of an operation, but cannot be used as an input. +type G2 = bn256.G2 + +// PairingCheck calculates the Optimal Ate pairing for a set of points. +func PairingCheck(a []*G1, b []*G2) bool { + return bn256.PairingCheck(a, b) +} diff --git a/crypto/bn256/bn256_fuzz.go b/crypto/bn256/bn256_fuzz.go new file mode 100644 index 000000000..f360b0541 --- /dev/null +++ b/crypto/bn256/bn256_fuzz.go @@ -0,0 +1,138 @@ +// Copyright 2018 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +// +build gofuzz + +package bn256 + +import ( + "bytes" + "math/big" + + cloudflare "github.com/ethereum/go-ethereum/crypto/bn256/cloudflare" + google "github.com/ethereum/go-ethereum/crypto/bn256/google" +) + +// FuzzAdd fuzzez bn256 addition between the Google and Cloudflare libraries. +func FuzzAdd(data []byte) int { + // Ensure we have enough data in the first place + if len(data) != 128 { + return 0 + } + // Ensure both libs can parse the first curve point + xc := new(cloudflare.G1) + _, errc := xc.Unmarshal(data[:64]) + + xg := new(google.G1) + _, errg := xg.Unmarshal(data[:64]) + + if (errc == nil) != (errg == nil) { + panic("parse mismatch") + } else if errc != nil { + return 0 + } + // Ensure both libs can parse the second curve point + yc := new(cloudflare.G1) + _, errc = yc.Unmarshal(data[64:]) + + yg := new(google.G1) + _, errg = yg.Unmarshal(data[64:]) + + if (errc == nil) != (errg == nil) { + panic("parse mismatch") + } else if errc != nil { + return 0 + } + // Add the two points and ensure they result in the same output + rc := new(cloudflare.G1) + rc.Add(xc, yc) + + rg := new(google.G1) + rg.Add(xg, yg) + + if !bytes.Equal(rc.Marshal(), rg.Marshal()) { + panic("add mismatch") + } + return 0 +} + +// FuzzMul fuzzez bn256 scalar multiplication between the Google and Cloudflare +// libraries. +func FuzzMul(data []byte) int { + // Ensure we have enough data in the first place + if len(data) != 96 { + return 0 + } + // Ensure both libs can parse the curve point + pc := new(cloudflare.G1) + _, errc := pc.Unmarshal(data[:64]) + + pg := new(google.G1) + _, errg := pg.Unmarshal(data[:64]) + + if (errc == nil) != (errg == nil) { + panic("parse mismatch") + } else if errc != nil { + return 0 + } + // Add the two points and ensure they result in the same output + rc := new(cloudflare.G1) + rc.ScalarMult(pc, new(big.Int).SetBytes(data[64:])) + + rg := new(google.G1) + rg.ScalarMult(pg, new(big.Int).SetBytes(data[64:])) + + if !bytes.Equal(rc.Marshal(), rg.Marshal()) { + panic("scalar mul mismatch") + } + return 0 +} + +func FuzzPair(data []byte) int { + // Ensure we have enough data in the first place + if len(data) != 192 { + return 0 + } + // Ensure both libs can parse the curve point + pc := new(cloudflare.G1) + _, errc := pc.Unmarshal(data[:64]) + + pg := new(google.G1) + _, errg := pg.Unmarshal(data[:64]) + + if (errc == nil) != (errg == nil) { + panic("parse mismatch") + } else if errc != nil { + return 0 + } + // Ensure both libs can parse the twist point + tc := new(cloudflare.G2) + _, errc = tc.Unmarshal(data[64:]) + + tg := new(google.G2) + _, errg = tg.Unmarshal(data[64:]) + + if (errc == nil) != (errg == nil) { + panic("parse mismatch") + } else if errc != nil { + return 0 + } + // Pair the two points and ensure thet result in the same output + if cloudflare.PairingCheck([]*cloudflare.G1{pc}, []*cloudflare.G2{tc}) != google.PairingCheck([]*google.G1{pg}, []*google.G2{tg}) { + panic("pair mismatch") + } + return 0 +} diff --git a/crypto/bn256/bn256_other.go b/crypto/bn256/bn256_other.go deleted file mode 100644 index 81977a0a8..000000000 --- a/crypto/bn256/bn256_other.go +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2018 The go-ethereum Authors -// This file is part of the go-ethereum library. -// -// The go-ethereum library is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// The go-ethereum library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with the go-ethereum library. If not, see . - -// +build !amd64 appengine gccgo - -// Package bn256 implements the Optimal Ate pairing over a 256-bit Barreto-Naehrig curve. -package bn256 - -import ( - "math/big" - - "github.com/ethereum/go-ethereum/crypto/bn256/google" -) - -// G1 is an abstract cyclic group. The zero value is suitable for use as the -// output of an operation, but cannot be used as an input. -type G1 struct { - bn256.G1 -} - -// Add sets e to a+b and then returns e. -func (e *G1) Add(a, b *G1) *G1 { - e.G1.Add(&a.G1, &b.G1) - return e -} - -// ScalarMult sets e to a*k and then returns e. -func (e *G1) ScalarMult(a *G1, k *big.Int) *G1 { - e.G1.ScalarMult(&a.G1, k) - return e -} - -// G2 is an abstract cyclic group. The zero value is suitable for use as the -// output of an operation, but cannot be used as an input. -type G2 struct { - bn256.G2 -} - -// PairingCheck calculates the Optimal Ate pairing for a set of points. -func PairingCheck(a []*G1, b []*G2) bool { - as := make([]*bn256.G1, len(a)) - for i, p := range a { - as[i] = &p.G1 - } - bs := make([]*bn256.G2, len(b)) - for i, p := range b { - bs[i] = &p.G2 - } - return bn256.PairingCheck(as, bs) -} diff --git a/crypto/bn256/bn256_slow.go b/crypto/bn256/bn256_slow.go new file mode 100644 index 000000000..61373763b --- /dev/null +++ b/crypto/bn256/bn256_slow.go @@ -0,0 +1,35 @@ +// Copyright 2018 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see . + +// +build !amd64,!arm64 + +// Package bn256 implements the Optimal Ate pairing over a 256-bit Barreto-Naehrig curve. +package bn256 + +import "github.com/ethereum/go-ethereum/crypto/bn256/google" + +// G1 is an abstract cyclic group. The zero value is suitable for use as the +// output of an operation, but cannot be used as an input. +type G1 = bn256.G1 + +// G2 is an abstract cyclic group. The zero value is suitable for use as the +// output of an operation, but cannot be used as an input. +type G2 = bn256.G2 + +// PairingCheck calculates the Optimal Ate pairing for a set of points. +func PairingCheck(a []*G1, b []*G2) bool { + return bn256.PairingCheck(a, b) +} diff --git a/crypto/bn256/cloudflare/bn256_test.go b/crypto/bn256/cloudflare/bn256_test.go index 369a3edaa..0c8016d86 100644 --- a/crypto/bn256/cloudflare/bn256_test.go +++ b/crypto/bn256/cloudflare/bn256_test.go @@ -1,5 +1,3 @@ -// +build amd64,!appengine,!gccgo - package bn256 import ( diff --git a/crypto/bn256/cloudflare/curve.go b/crypto/bn256/cloudflare/curve.go index b6aecc0a6..18e9b38f3 100644 --- a/crypto/bn256/cloudflare/curve.go +++ b/crypto/bn256/cloudflare/curve.go @@ -183,15 +183,24 @@ func (c *curvePoint) Double(a *curvePoint) { } func (c *curvePoint) Mul(a *curvePoint, scalar *big.Int) { - sum, t := &curvePoint{}, &curvePoint{} + precomp := [1 << 2]*curvePoint{nil, {}, {}, {}} + precomp[1].Set(a) + precomp[2].Set(a) + gfpMul(&precomp[2].x, &precomp[2].x, xiTo2PSquaredMinus2Over3) + precomp[3].Add(precomp[1], precomp[2]) + + multiScalar := curveLattice.Multi(scalar) + + sum := &curvePoint{} sum.SetInfinity() + t := &curvePoint{} - for i := scalar.BitLen(); i >= 0; i-- { + for i := len(multiScalar) - 1; i >= 0; i-- { t.Double(sum) - if scalar.Bit(i) != 0 { - sum.Add(t, a) - } else { + if multiScalar[i] == 0 { sum.Set(t) + } else { + sum.Add(t, precomp[multiScalar[i]]) } } c.Set(sum) diff --git a/crypto/bn256/cloudflare/example_test.go b/crypto/bn256/cloudflare/example_test.go index 2ee545c67..b2d19807a 100644 --- a/crypto/bn256/cloudflare/example_test.go +++ b/crypto/bn256/cloudflare/example_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64,!appengine,!gccgo - package bn256 import ( diff --git a/crypto/bn256/cloudflare/gfp.h b/crypto/bn256/cloudflare/gfp.h deleted file mode 100644 index 66f5a4d07..000000000 --- a/crypto/bn256/cloudflare/gfp.h +++ /dev/null @@ -1,32 +0,0 @@ -#define storeBlock(a0,a1,a2,a3, r) \ - MOVQ a0, 0+r \ - MOVQ a1, 8+r \ - MOVQ a2, 16+r \ - MOVQ a3, 24+r - -#define loadBlock(r, a0,a1,a2,a3) \ - MOVQ 0+r, a0 \ - MOVQ 8+r, a1 \ - MOVQ 16+r, a2 \ - MOVQ 24+r, a3 - -#define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \ - \ // b = a-p - MOVQ a0, b0 \ - MOVQ a1, b1 \ - MOVQ a2, b2 \ - MOVQ a3, b3 \ - MOVQ a4, b4 \ - \ - SUBQ ·p2+0(SB), b0 \ - SBBQ ·p2+8(SB), b1 \ - SBBQ ·p2+16(SB), b2 \ - SBBQ ·p2+24(SB), b3 \ - SBBQ $0, b4 \ - \ - \ // if b is negative then return a - \ // else return b - CMOVQCC b0, a0 \ - CMOVQCC b1, a1 \ - CMOVQCC b2, a2 \ - CMOVQCC b3, a3 diff --git a/crypto/bn256/cloudflare/gfp_amd64.go b/crypto/bn256/cloudflare/gfp_amd64.go deleted file mode 100644 index ac4f1a9c6..000000000 --- a/crypto/bn256/cloudflare/gfp_amd64.go +++ /dev/null @@ -1,15 +0,0 @@ -// +build amd64,!appengine,!gccgo - -package bn256 - -// go:noescape -func gfpNeg(c, a *gfP) - -//go:noescape -func gfpAdd(c, a, b *gfP) - -//go:noescape -func gfpSub(c, a, b *gfP) - -//go:noescape -func gfpMul(c, a, b *gfP) diff --git a/crypto/bn256/cloudflare/gfp_amd64.s b/crypto/bn256/cloudflare/gfp_amd64.s index 2d0176f2e..3a785d200 100644 --- a/crypto/bn256/cloudflare/gfp_amd64.s +++ b/crypto/bn256/cloudflare/gfp_amd64.s @@ -1,8 +1,40 @@ -// +build amd64,!appengine,!gccgo - -#include "gfp.h" -#include "mul.h" -#include "mul_bmi2.h" +// +build amd64,!generic + +#define storeBlock(a0,a1,a2,a3, r) \ + MOVQ a0, 0+r \ + MOVQ a1, 8+r \ + MOVQ a2, 16+r \ + MOVQ a3, 24+r + +#define loadBlock(r, a0,a1,a2,a3) \ + MOVQ 0+r, a0 \ + MOVQ 8+r, a1 \ + MOVQ 16+r, a2 \ + MOVQ 24+r, a3 + +#define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \ + \ // b = a-p + MOVQ a0, b0 \ + MOVQ a1, b1 \ + MOVQ a2, b2 \ + MOVQ a3, b3 \ + MOVQ a4, b4 \ + \ + SUBQ ·p2+0(SB), b0 \ + SBBQ ·p2+8(SB), b1 \ + SBBQ ·p2+16(SB), b2 \ + SBBQ ·p2+24(SB), b3 \ + SBBQ $0, b4 \ + \ + \ // if b is negative then return a + \ // else return b + CMOVQCC b0, a0 \ + CMOVQCC b1, a1 \ + CMOVQCC b2, a2 \ + CMOVQCC b3, a3 + +#include "mul_amd64.h" +#include "mul_bmi2_amd64.h" TEXT ·gfpNeg(SB),0,$0-16 MOVQ ·p2+0(SB), R8 diff --git a/crypto/bn256/cloudflare/gfp_arm64.s b/crypto/bn256/cloudflare/gfp_arm64.s new file mode 100644 index 000000000..c65e80168 --- /dev/null +++ b/crypto/bn256/cloudflare/gfp_arm64.s @@ -0,0 +1,113 @@ +// +build arm64,!generic + +#define storeBlock(a0,a1,a2,a3, r) \ + MOVD a0, 0+r \ + MOVD a1, 8+r \ + MOVD a2, 16+r \ + MOVD a3, 24+r + +#define loadBlock(r, a0,a1,a2,a3) \ + MOVD 0+r, a0 \ + MOVD 8+r, a1 \ + MOVD 16+r, a2 \ + MOVD 24+r, a3 + +#define loadModulus(p0,p1,p2,p3) \ + MOVD ·p2+0(SB), p0 \ + MOVD ·p2+8(SB), p1 \ + MOVD ·p2+16(SB), p2 \ + MOVD ·p2+24(SB), p3 + +#include "mul_arm64.h" + +TEXT ·gfpNeg(SB),0,$0-16 + MOVD a+8(FP), R0 + loadBlock(0(R0), R1,R2,R3,R4) + loadModulus(R5,R6,R7,R8) + + SUBS R1, R5, R1 + SBCS R2, R6, R2 + SBCS R3, R7, R3 + SBCS R4, R8, R4 + + SUBS R5, R1, R5 + SBCS R6, R2, R6 + SBCS R7, R3, R7 + SBCS R8, R4, R8 + + CSEL CS, R5, R1, R1 + CSEL CS, R6, R2, R2 + CSEL CS, R7, R3, R3 + CSEL CS, R8, R4, R4 + + MOVD c+0(FP), R0 + storeBlock(R1,R2,R3,R4, 0(R0)) + RET + +TEXT ·gfpAdd(SB),0,$0-24 + MOVD a+8(FP), R0 + loadBlock(0(R0), R1,R2,R3,R4) + MOVD b+16(FP), R0 + loadBlock(0(R0), R5,R6,R7,R8) + loadModulus(R9,R10,R11,R12) + MOVD ZR, R0 + + ADDS R5, R1 + ADCS R6, R2 + ADCS R7, R3 + ADCS R8, R4 + ADCS ZR, R0 + + SUBS R9, R1, R5 + SBCS R10, R2, R6 + SBCS R11, R3, R7 + SBCS R12, R4, R8 + SBCS ZR, R0, R0 + + CSEL CS, R5, R1, R1 + CSEL CS, R6, R2, R2 + CSEL CS, R7, R3, R3 + CSEL CS, R8, R4, R4 + + MOVD c+0(FP), R0 + storeBlock(R1,R2,R3,R4, 0(R0)) + RET + +TEXT ·gfpSub(SB),0,$0-24 + MOVD a+8(FP), R0 + loadBlock(0(R0), R1,R2,R3,R4) + MOVD b+16(FP), R0 + loadBlock(0(R0), R5,R6,R7,R8) + loadModulus(R9,R10,R11,R12) + + SUBS R5, R1 + SBCS R6, R2 + SBCS R7, R3 + SBCS R8, R4 + + CSEL CS, ZR, R9, R9 + CSEL CS, ZR, R10, R10 + CSEL CS, ZR, R11, R11 + CSEL CS, ZR, R12, R12 + + ADDS R9, R1 + ADCS R10, R2 + ADCS R11, R3 + ADCS R12, R4 + + MOVD c+0(FP), R0 + storeBlock(R1,R2,R3,R4, 0(R0)) + RET + +TEXT ·gfpMul(SB),0,$0-24 + MOVD a+8(FP), R0 + loadBlock(0(R0), R1,R2,R3,R4) + MOVD b+16(FP), R0 + loadBlock(0(R0), R5,R6,R7,R8) + + mul(R9,R10,R11,R12,R13,R14,R15,R16) + gfpReduce() + + MOVD c+0(FP), R0 + storeBlock(R1,R2,R3,R4, 0(R0)) + RET diff --git a/crypto/bn256/cloudflare/gfp_decl.go b/crypto/bn256/cloudflare/gfp_decl.go new file mode 100644 index 000000000..6a8a4fddb --- /dev/null +++ b/crypto/bn256/cloudflare/gfp_decl.go @@ -0,0 +1,18 @@ +// +build amd64,!generic arm64,!generic + +package bn256 + +// This file contains forward declarations for the architecture-specific +// assembly implementations of these functions, provided that they exist. + +// go:noescape +func gfpNeg(c, a *gfP) + +//go:noescape +func gfpAdd(c, a, b *gfP) + +//go:noescape +func gfpSub(c, a, b *gfP) + +//go:noescape +func gfpMul(c, a, b *gfP) diff --git a/crypto/bn256/cloudflare/gfp_generic.go b/crypto/bn256/cloudflare/gfp_generic.go new file mode 100644 index 000000000..8e6be9596 --- /dev/null +++ b/crypto/bn256/cloudflare/gfp_generic.go @@ -0,0 +1,173 @@ +// +build !amd64,!arm64 generic + +package bn256 + +func gfpCarry(a *gfP, head uint64) { + b := &gfP{} + + var carry uint64 + for i, pi := range p2 { + ai := a[i] + bi := ai - pi - carry + b[i] = bi + carry = (pi&^ai | (pi|^ai)&bi) >> 63 + } + carry = carry &^ head + + // If b is negative, then return a. + // Else return b. + carry = -carry + ncarry := ^carry + for i := 0; i < 4; i++ { + a[i] = (a[i] & carry) | (b[i] & ncarry) + } +} + +func gfpNeg(c, a *gfP) { + var carry uint64 + for i, pi := range p2 { + ai := a[i] + ci := pi - ai - carry + c[i] = ci + carry = (ai&^pi | (ai|^pi)&ci) >> 63 + } + gfpCarry(c, 0) +} + +func gfpAdd(c, a, b *gfP) { + var carry uint64 + for i, ai := range a { + bi := b[i] + ci := ai + bi + carry + c[i] = ci + carry = (ai&bi | (ai|bi)&^ci) >> 63 + } + gfpCarry(c, carry) +} + +func gfpSub(c, a, b *gfP) { + t := &gfP{} + + var carry uint64 + for i, pi := range p2 { + bi := b[i] + ti := pi - bi - carry + t[i] = ti + carry = (bi&^pi | (bi|^pi)&ti) >> 63 + } + + carry = 0 + for i, ai := range a { + ti := t[i] + ci := ai + ti + carry + c[i] = ci + carry = (ai&ti | (ai|ti)&^ci) >> 63 + } + gfpCarry(c, carry) +} + +func mul(a, b [4]uint64) [8]uint64 { + const ( + mask16 uint64 = 0x0000ffff + mask32 uint64 = 0xffffffff + ) + + var buff [32]uint64 + for i, ai := range a { + a0, a1, a2, a3 := ai&mask16, (ai>>16)&mask16, (ai>>32)&mask16, ai>>48 + + for j, bj := range b { + b0, b2 := bj&mask32, bj>>32 + + off := 4 * (i + j) + buff[off+0] += a0 * b0 + buff[off+1] += a1 * b0 + buff[off+2] += a2*b0 + a0*b2 + buff[off+3] += a3*b0 + a1*b2 + buff[off+4] += a2 * b2 + buff[off+5] += a3 * b2 + } + } + + for i := uint(1); i < 4; i++ { + shift := 16 * i + + var head, carry uint64 + for j := uint(0); j < 8; j++ { + block := 4 * j + + xi := buff[block] + yi := (buff[block+i] << shift) + head + zi := xi + yi + carry + buff[block] = zi + carry = (xi&yi | (xi|yi)&^zi) >> 63 + + head = buff[block+i] >> (64 - shift) + } + } + + return [8]uint64{buff[0], buff[4], buff[8], buff[12], buff[16], buff[20], buff[24], buff[28]} +} + +func halfMul(a, b [4]uint64) [4]uint64 { + const ( + mask16 uint64 = 0x0000ffff + mask32 uint64 = 0xffffffff + ) + + var buff [18]uint64 + for i, ai := range a { + a0, a1, a2, a3 := ai&mask16, (ai>>16)&mask16, (ai>>32)&mask16, ai>>48 + + for j, bj := range b { + if i+j > 3 { + break + } + b0, b2 := bj&mask32, bj>>32 + + off := 4 * (i + j) + buff[off+0] += a0 * b0 + buff[off+1] += a1 * b0 + buff[off+2] += a2*b0 + a0*b2 + buff[off+3] += a3*b0 + a1*b2 + buff[off+4] += a2 * b2 + buff[off+5] += a3 * b2 + } + } + + for i := uint(1); i < 4; i++ { + shift := 16 * i + + var head, carry uint64 + for j := uint(0); j < 4; j++ { + block := 4 * j + + xi := buff[block] + yi := (buff[block+i] << shift) + head + zi := xi + yi + carry + buff[block] = zi + carry = (xi&yi | (xi|yi)&^zi) >> 63 + + head = buff[block+i] >> (64 - shift) + } + } + + return [4]uint64{buff[0], buff[4], buff[8], buff[12]} +} + +func gfpMul(c, a, b *gfP) { + T := mul(*a, *b) + m := halfMul([4]uint64{T[0], T[1], T[2], T[3]}, np) + t := mul([4]uint64{m[0], m[1], m[2], m[3]}, p2) + + var carry uint64 + for i, Ti := range T { + ti := t[i] + zi := Ti + ti + carry + T[i] = zi + carry = (Ti&ti | (Ti|ti)&^zi) >> 63 + } + + *c = gfP{T[4], T[5], T[6], T[7]} + gfpCarry(c, carry) +} diff --git a/crypto/bn256/cloudflare/gfp_pure.go b/crypto/bn256/cloudflare/gfp_pure.go deleted file mode 100644 index 8fa5d3053..000000000 --- a/crypto/bn256/cloudflare/gfp_pure.go +++ /dev/null @@ -1,19 +0,0 @@ -// +build !amd64 appengine gccgo - -package bn256 - -func gfpNeg(c, a *gfP) { - panic("unsupported architecture") -} - -func gfpAdd(c, a, b *gfP) { - panic("unsupported architecture") -} - -func gfpSub(c, a, b *gfP) { - panic("unsupported architecture") -} - -func gfpMul(c, a, b *gfP) { - panic("unsupported architecture") -} diff --git a/crypto/bn256/cloudflare/gfp_test.go b/crypto/bn256/cloudflare/gfp_test.go index aff5e0531..16ab2a841 100644 --- a/crypto/bn256/cloudflare/gfp_test.go +++ b/crypto/bn256/cloudflare/gfp_test.go @@ -1,5 +1,3 @@ -// +build amd64,!appengine,!gccgo - package bn256 import ( diff --git a/crypto/bn256/cloudflare/lattice.go b/crypto/bn256/cloudflare/lattice.go new file mode 100644 index 000000000..f9ace4d9f --- /dev/null +++ b/crypto/bn256/cloudflare/lattice.go @@ -0,0 +1,115 @@ +package bn256 + +import ( + "math/big" +) + +var half = new(big.Int).Rsh(Order, 1) + +var curveLattice = &lattice{ + vectors: [][]*big.Int{ + {bigFromBase10("147946756881789319000765030803803410728"), bigFromBase10("147946756881789319010696353538189108491")}, + {bigFromBase10("147946756881789319020627676272574806254"), bigFromBase10("-147946756881789318990833708069417712965")}, + }, + inverse: []*big.Int{ + bigFromBase10("147946756881789318990833708069417712965"), + bigFromBase10("147946756881789319010696353538189108491"), + }, + det: bigFromBase10("43776485743678550444492811490514550177096728800832068687396408373151616991234"), +} + +var targetLattice = &lattice{ + vectors: [][]*big.Int{ + {bigFromBase10("9931322734385697761"), bigFromBase10("9931322734385697761"), bigFromBase10("9931322734385697763"), bigFromBase10("9931322734385697764")}, + {bigFromBase10("4965661367192848881"), bigFromBase10("4965661367192848881"), bigFromBase10("4965661367192848882"), bigFromBase10("-9931322734385697762")}, + {bigFromBase10("-9931322734385697762"), bigFromBase10("-4965661367192848881"), bigFromBase10("4965661367192848881"), bigFromBase10("-4965661367192848882")}, + {bigFromBase10("9931322734385697763"), bigFromBase10("-4965661367192848881"), bigFromBase10("-4965661367192848881"), bigFromBase10("-4965661367192848881")}, + }, + inverse: []*big.Int{ + bigFromBase10("734653495049373973658254490726798021314063399421879442165"), + bigFromBase10("147946756881789319000765030803803410728"), + bigFromBase10("-147946756881789319005730692170996259609"), + bigFromBase10("1469306990098747947464455738335385361643788813749140841702"), + }, + det: new(big.Int).Set(Order), +} + +type lattice struct { + vectors [][]*big.Int + inverse []*big.Int + det *big.Int +} + +// decompose takes a scalar mod Order as input and finds a short, positive decomposition of it wrt to the lattice basis. +func (l *lattice) decompose(k *big.Int) []*big.Int { + n := len(l.inverse) + + // Calculate closest vector in lattice to with Babai's rounding. + c := make([]*big.Int, n) + for i := 0; i < n; i++ { + c[i] = new(big.Int).Mul(k, l.inverse[i]) + round(c[i], l.det) + } + + // Transform vectors according to c and subtract . + out := make([]*big.Int, n) + temp := new(big.Int) + + for i := 0; i < n; i++ { + out[i] = new(big.Int) + + for j := 0; j < n; j++ { + temp.Mul(c[j], l.vectors[j][i]) + out[i].Add(out[i], temp) + } + + out[i].Neg(out[i]) + out[i].Add(out[i], l.vectors[0][i]).Add(out[i], l.vectors[0][i]) + } + out[0].Add(out[0], k) + + return out +} + +func (l *lattice) Precompute(add func(i, j uint)) { + n := uint(len(l.vectors)) + total := uint(1) << n + + for i := uint(0); i < n; i++ { + for j := uint(0); j < total; j++ { + if (j>>i)&1 == 1 { + add(i, j) + } + } + } +} + +func (l *lattice) Multi(scalar *big.Int) []uint8 { + decomp := l.decompose(scalar) + + maxLen := 0 + for _, x := range decomp { + if x.BitLen() > maxLen { + maxLen = x.BitLen() + } + } + + out := make([]uint8, maxLen) + for j, x := range decomp { + for i := 0; i < maxLen; i++ { + out[i] += uint8(x.Bit(i)) << uint(j) + } + } + + return out +} + +// round sets num to num/denom rounded to the nearest integer. +func round(num, denom *big.Int) { + r := new(big.Int) + num.DivMod(num, denom, r) + + if r.Cmp(half) == 1 { + num.Add(num, big.NewInt(1)) + } +} diff --git a/crypto/bn256/cloudflare/lattice_test.go b/crypto/bn256/cloudflare/lattice_test.go new file mode 100644 index 000000000..4d52ad9b2 --- /dev/null +++ b/crypto/bn256/cloudflare/lattice_test.go @@ -0,0 +1,29 @@ +package bn256 + +import ( + "crypto/rand" + + "testing" +) + +func TestLatticeReduceCurve(t *testing.T) { + k, _ := rand.Int(rand.Reader, Order) + ks := curveLattice.decompose(k) + + if ks[0].BitLen() > 130 || ks[1].BitLen() > 130 { + t.Fatal("reduction too large") + } else if ks[0].Sign() < 0 || ks[1].Sign() < 0 { + t.Fatal("reduction must be positive") + } +} + +func TestLatticeReduceTarget(t *testing.T) { + k, _ := rand.Int(rand.Reader, Order) + ks := targetLattice.decompose(k) + + if ks[0].BitLen() > 66 || ks[1].BitLen() > 66 || ks[2].BitLen() > 66 || ks[3].BitLen() > 66 { + t.Fatal("reduction too large") + } else if ks[0].Sign() < 0 || ks[1].Sign() < 0 || ks[2].Sign() < 0 || ks[3].Sign() < 0 { + t.Fatal("reduction must be positive") + } +} diff --git a/crypto/bn256/cloudflare/main_test.go b/crypto/bn256/cloudflare/main_test.go index f0d59a404..0230f1b19 100644 --- a/crypto/bn256/cloudflare/main_test.go +++ b/crypto/bn256/cloudflare/main_test.go @@ -1,5 +1,3 @@ -// +build amd64,!appengine,!gccgo - package bn256 import ( diff --git a/crypto/bn256/cloudflare/mul.h b/crypto/bn256/cloudflare/mul.h deleted file mode 100644 index bab5da831..000000000 --- a/crypto/bn256/cloudflare/mul.h +++ /dev/null @@ -1,181 +0,0 @@ -#define mul(a0,a1,a2,a3, rb, stack) \ - MOVQ a0, AX \ - MULQ 0+rb \ - MOVQ AX, R8 \ - MOVQ DX, R9 \ - MOVQ a0, AX \ - MULQ 8+rb \ - ADDQ AX, R9 \ - ADCQ $0, DX \ - MOVQ DX, R10 \ - MOVQ a0, AX \ - MULQ 16+rb \ - ADDQ AX, R10 \ - ADCQ $0, DX \ - MOVQ DX, R11 \ - MOVQ a0, AX \ - MULQ 24+rb \ - ADDQ AX, R11 \ - ADCQ $0, DX \ - MOVQ DX, R12 \ - \ - storeBlock(R8,R9,R10,R11, 0+stack) \ - MOVQ R12, 32+stack \ - \ - MOVQ a1, AX \ - MULQ 0+rb \ - MOVQ AX, R8 \ - MOVQ DX, R9 \ - MOVQ a1, AX \ - MULQ 8+rb \ - ADDQ AX, R9 \ - ADCQ $0, DX \ - MOVQ DX, R10 \ - MOVQ a1, AX \ - MULQ 16+rb \ - ADDQ AX, R10 \ - ADCQ $0, DX \ - MOVQ DX, R11 \ - MOVQ a1, AX \ - MULQ 24+rb \ - ADDQ AX, R11 \ - ADCQ $0, DX \ - MOVQ DX, R12 \ - \ - ADDQ 8+stack, R8 \ - ADCQ 16+stack, R9 \ - ADCQ 24+stack, R10 \ - ADCQ 32+stack, R11 \ - ADCQ $0, R12 \ - storeBlock(R8,R9,R10,R11, 8+stack) \ - MOVQ R12, 40+stack \ - \ - MOVQ a2, AX \ - MULQ 0+rb \ - MOVQ AX, R8 \ - MOVQ DX, R9 \ - MOVQ a2, AX \ - MULQ 8+rb \ - ADDQ AX, R9 \ - ADCQ $0, DX \ - MOVQ DX, R10 \ - MOVQ a2, AX \ - MULQ 16+rb \ - ADDQ AX, R10 \ - ADCQ $0, DX \ - MOVQ DX, R11 \ - MOVQ a2, AX \ - MULQ 24+rb \ - ADDQ AX, R11 \ - ADCQ $0, DX \ - MOVQ DX, R12 \ - \ - ADDQ 16+stack, R8 \ - ADCQ 24+stack, R9 \ - ADCQ 32+stack, R10 \ - ADCQ 40+stack, R11 \ - ADCQ $0, R12 \ - storeBlock(R8,R9,R10,R11, 16+stack) \ - MOVQ R12, 48+stack \ - \ - MOVQ a3, AX \ - MULQ 0+rb \ - MOVQ AX, R8 \ - MOVQ DX, R9 \ - MOVQ a3, AX \ - MULQ 8+rb \ - ADDQ AX, R9 \ - ADCQ $0, DX \ - MOVQ DX, R10 \ - MOVQ a3, AX \ - MULQ 16+rb \ - ADDQ AX, R10 \ - ADCQ $0, DX \ - MOVQ DX, R11 \ - MOVQ a3, AX \ - MULQ 24+rb \ - ADDQ AX, R11 \ - ADCQ $0, DX \ - MOVQ DX, R12 \ - \ - ADDQ 24+stack, R8 \ - ADCQ 32+stack, R9 \ - ADCQ 40+stack, R10 \ - ADCQ 48+stack, R11 \ - ADCQ $0, R12 \ - storeBlock(R8,R9,R10,R11, 24+stack) \ - MOVQ R12, 56+stack - -#define gfpReduce(stack) \ - \ // m = (T * N') mod R, store m in R8:R9:R10:R11 - MOVQ ·np+0(SB), AX \ - MULQ 0+stack \ - MOVQ AX, R8 \ - MOVQ DX, R9 \ - MOVQ ·np+0(SB), AX \ - MULQ 8+stack \ - ADDQ AX, R9 \ - ADCQ $0, DX \ - MOVQ DX, R10 \ - MOVQ ·np+0(SB), AX \ - MULQ 16+stack \ - ADDQ AX, R10 \ - ADCQ $0, DX \ - MOVQ DX, R11 \ - MOVQ ·np+0(SB), AX \ - MULQ 24+stack \ - ADDQ AX, R11 \ - \ - MOVQ ·np+8(SB), AX \ - MULQ 0+stack \ - MOVQ AX, R12 \ - MOVQ DX, R13 \ - MOVQ ·np+8(SB), AX \ - MULQ 8+stack \ - ADDQ AX, R13 \ - ADCQ $0, DX \ - MOVQ DX, R14 \ - MOVQ ·np+8(SB), AX \ - MULQ 16+stack \ - ADDQ AX, R14 \ - \ - ADDQ R12, R9 \ - ADCQ R13, R10 \ - ADCQ R14, R11 \ - \ - MOVQ ·np+16(SB), AX \ - MULQ 0+stack \ - MOVQ AX, R12 \ - MOVQ DX, R13 \ - MOVQ ·np+16(SB), AX \ - MULQ 8+stack \ - ADDQ AX, R13 \ - \ - ADDQ R12, R10 \ - ADCQ R13, R11 \ - \ - MOVQ ·np+24(SB), AX \ - MULQ 0+stack \ - ADDQ AX, R11 \ - \ - storeBlock(R8,R9,R10,R11, 64+stack) \ - \ - \ // m * N - mul(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64+stack, 96+stack) \ - \ - \ // Add the 512-bit intermediate to m*N - loadBlock(96+stack, R8,R9,R10,R11) \ - loadBlock(128+stack, R12,R13,R14,R15) \ - \ - MOVQ $0, AX \ - ADDQ 0+stack, R8 \ - ADCQ 8+stack, R9 \ - ADCQ 16+stack, R10 \ - ADCQ 24+stack, R11 \ - ADCQ 32+stack, R12 \ - ADCQ 40+stack, R13 \ - ADCQ 48+stack, R14 \ - ADCQ 56+stack, R15 \ - ADCQ $0, AX \ - \ - gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX) diff --git a/crypto/bn256/cloudflare/mul_amd64.h b/crypto/bn256/cloudflare/mul_amd64.h new file mode 100644 index 000000000..bab5da831 --- /dev/null +++ b/crypto/bn256/cloudflare/mul_amd64.h @@ -0,0 +1,181 @@ +#define mul(a0,a1,a2,a3, rb, stack) \ + MOVQ a0, AX \ + MULQ 0+rb \ + MOVQ AX, R8 \ + MOVQ DX, R9 \ + MOVQ a0, AX \ + MULQ 8+rb \ + ADDQ AX, R9 \ + ADCQ $0, DX \ + MOVQ DX, R10 \ + MOVQ a0, AX \ + MULQ 16+rb \ + ADDQ AX, R10 \ + ADCQ $0, DX \ + MOVQ DX, R11 \ + MOVQ a0, AX \ + MULQ 24+rb \ + ADDQ AX, R11 \ + ADCQ $0, DX \ + MOVQ DX, R12 \ + \ + storeBlock(R8,R9,R10,R11, 0+stack) \ + MOVQ R12, 32+stack \ + \ + MOVQ a1, AX \ + MULQ 0+rb \ + MOVQ AX, R8 \ + MOVQ DX, R9 \ + MOVQ a1, AX \ + MULQ 8+rb \ + ADDQ AX, R9 \ + ADCQ $0, DX \ + MOVQ DX, R10 \ + MOVQ a1, AX \ + MULQ 16+rb \ + ADDQ AX, R10 \ + ADCQ $0, DX \ + MOVQ DX, R11 \ + MOVQ a1, AX \ + MULQ 24+rb \ + ADDQ AX, R11 \ + ADCQ $0, DX \ + MOVQ DX, R12 \ + \ + ADDQ 8+stack, R8 \ + ADCQ 16+stack, R9 \ + ADCQ 24+stack, R10 \ + ADCQ 32+stack, R11 \ + ADCQ $0, R12 \ + storeBlock(R8,R9,R10,R11, 8+stack) \ + MOVQ R12, 40+stack \ + \ + MOVQ a2, AX \ + MULQ 0+rb \ + MOVQ AX, R8 \ + MOVQ DX, R9 \ + MOVQ a2, AX \ + MULQ 8+rb \ + ADDQ AX, R9 \ + ADCQ $0, DX \ + MOVQ DX, R10 \ + MOVQ a2, AX \ + MULQ 16+rb \ + ADDQ AX, R10 \ + ADCQ $0, DX \ + MOVQ DX, R11 \ + MOVQ a2, AX \ + MULQ 24+rb \ + ADDQ AX, R11 \ + ADCQ $0, DX \ + MOVQ DX, R12 \ + \ + ADDQ 16+stack, R8 \ + ADCQ 24+stack, R9 \ + ADCQ 32+stack, R10 \ + ADCQ 40+stack, R11 \ + ADCQ $0, R12 \ + storeBlock(R8,R9,R10,R11, 16+stack) \ + MOVQ R12, 48+stack \ + \ + MOVQ a3, AX \ + MULQ 0+rb \ + MOVQ AX, R8 \ + MOVQ DX, R9 \ + MOVQ a3, AX \ + MULQ 8+rb \ + ADDQ AX, R9 \ + ADCQ $0, DX \ + MOVQ DX, R10 \ + MOVQ a3, AX \ + MULQ 16+rb \ + ADDQ AX, R10 \ + ADCQ $0, DX \ + MOVQ DX, R11 \ + MOVQ a3, AX \ + MULQ 24+rb \ + ADDQ AX, R11 \ + ADCQ $0, DX \ + MOVQ DX, R12 \ + \ + ADDQ 24+stack, R8 \ + ADCQ 32+stack, R9 \ + ADCQ 40+stack, R10 \ + ADCQ 48+stack, R11 \ + ADCQ $0, R12 \ + storeBlock(R8,R9,R10,R11, 24+stack) \ + MOVQ R12, 56+stack + +#define gfpReduce(stack) \ + \ // m = (T * N') mod R, store m in R8:R9:R10:R11 + MOVQ ·np+0(SB), AX \ + MULQ 0+stack \ + MOVQ AX, R8 \ + MOVQ DX, R9 \ + MOVQ ·np+0(SB), AX \ + MULQ 8+stack \ + ADDQ AX, R9 \ + ADCQ $0, DX \ + MOVQ DX, R10 \ + MOVQ ·np+0(SB), AX \ + MULQ 16+stack \ + ADDQ AX, R10 \ + ADCQ $0, DX \ + MOVQ DX, R11 \ + MOVQ ·np+0(SB), AX \ + MULQ 24+stack \ + ADDQ AX, R11 \ + \ + MOVQ ·np+8(SB), AX \ + MULQ 0+stack \ + MOVQ AX, R12 \ + MOVQ DX, R13 \ + MOVQ ·np+8(SB), AX \ + MULQ 8+stack \ + ADDQ AX, R13 \ + ADCQ $0, DX \ + MOVQ DX, R14 \ + MOVQ ·np+8(SB), AX \ + MULQ 16+stack \ + ADDQ AX, R14 \ + \ + ADDQ R12, R9 \ + ADCQ R13, R10 \ + ADCQ R14, R11 \ + \ + MOVQ ·np+16(SB), AX \ + MULQ 0+stack \ + MOVQ AX, R12 \ + MOVQ DX, R13 \ + MOVQ ·np+16(SB), AX \ + MULQ 8+stack \ + ADDQ AX, R13 \ + \ + ADDQ R12, R10 \ + ADCQ R13, R11 \ + \ + MOVQ ·np+24(SB), AX \ + MULQ 0+stack \ + ADDQ AX, R11 \ + \ + storeBlock(R8,R9,R10,R11, 64+stack) \ + \ + \ // m * N + mul(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64+stack, 96+stack) \ + \ + \ // Add the 512-bit intermediate to m*N + loadBlock(96+stack, R8,R9,R10,R11) \ + loadBlock(128+stack, R12,R13,R14,R15) \ + \ + MOVQ $0, AX \ + ADDQ 0+stack, R8 \ + ADCQ 8+stack, R9 \ + ADCQ 16+stack, R10 \ + ADCQ 24+stack, R11 \ + ADCQ 32+stack, R12 \ + ADCQ 40+stack, R13 \ + ADCQ 48+stack, R14 \ + ADCQ 56+stack, R15 \ + ADCQ $0, AX \ + \ + gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX) diff --git a/crypto/bn256/cloudflare/mul_arm64.h b/crypto/bn256/cloudflare/mul_arm64.h new file mode 100644 index 000000000..75d522173 --- /dev/null +++ b/crypto/bn256/cloudflare/mul_arm64.h @@ -0,0 +1,133 @@ +#define mul(c0,c1,c2,c3,c4,c5,c6,c7) \ + MUL R1, R5, c0 \ + UMULH R1, R5, c1 \ + MUL R1, R6, R0 \ + ADDS R0, c1 \ + UMULH R1, R6, c2 \ + MUL R1, R7, R0 \ + ADCS R0, c2 \ + UMULH R1, R7, c3 \ + MUL R1, R8, R0 \ + ADCS R0, c3 \ + UMULH R1, R8, c4 \ + ADCS ZR, c4 \ + \ + MUL R2, R5, R25 \ + UMULH R2, R5, R26 \ + MUL R2, R6, R0 \ + ADDS R0, R26 \ + UMULH R2, R6, R27 \ + MUL R2, R7, R0 \ + ADCS R0, R27 \ + UMULH R2, R7, R29 \ + MUL R2, R8, R0 \ + ADCS R0, R29 \ + UMULH R2, R8, c5 \ + ADCS ZR, c5 \ + ADDS R25, c1 \ + ADCS R26, c2 \ + ADCS R27, c3 \ + ADCS R29, c4 \ + ADCS ZR, c5 \ + \ + MUL R3, R5, R25 \ + UMULH R3, R5, R26 \ + MUL R3, R6, R0 \ + ADDS R0, R26 \ + UMULH R3, R6, R27 \ + MUL R3, R7, R0 \ + ADCS R0, R27 \ + UMULH R3, R7, R29 \ + MUL R3, R8, R0 \ + ADCS R0, R29 \ + UMULH R3, R8, c6 \ + ADCS ZR, c6 \ + ADDS R25, c2 \ + ADCS R26, c3 \ + ADCS R27, c4 \ + ADCS R29, c5 \ + ADCS ZR, c6 \ + \ + MUL R4, R5, R25 \ + UMULH R4, R5, R26 \ + MUL R4, R6, R0 \ + ADDS R0, R26 \ + UMULH R4, R6, R27 \ + MUL R4, R7, R0 \ + ADCS R0, R27 \ + UMULH R4, R7, R29 \ + MUL R4, R8, R0 \ + ADCS R0, R29 \ + UMULH R4, R8, c7 \ + ADCS ZR, c7 \ + ADDS R25, c3 \ + ADCS R26, c4 \ + ADCS R27, c5 \ + ADCS R29, c6 \ + ADCS ZR, c7 + +#define gfpReduce() \ + \ // m = (T * N') mod R, store m in R1:R2:R3:R4 + MOVD ·np+0(SB), R17 \ + MOVD ·np+8(SB), R18 \ + MOVD ·np+16(SB), R19 \ + MOVD ·np+24(SB), R20 \ + \ + MUL R9, R17, R1 \ + UMULH R9, R17, R2 \ + MUL R9, R18, R0 \ + ADDS R0, R2 \ + UMULH R9, R18, R3 \ + MUL R9, R19, R0 \ + ADCS R0, R3 \ + UMULH R9, R19, R4 \ + MUL R9, R20, R0 \ + ADCS R0, R4 \ + \ + MUL R10, R17, R21 \ + UMULH R10, R17, R22 \ + MUL R10, R18, R0 \ + ADDS R0, R22 \ + UMULH R10, R18, R23 \ + MUL R10, R19, R0 \ + ADCS R0, R23 \ + ADDS R21, R2 \ + ADCS R22, R3 \ + ADCS R23, R4 \ + \ + MUL R11, R17, R21 \ + UMULH R11, R17, R22 \ + MUL R11, R18, R0 \ + ADDS R0, R22 \ + ADDS R21, R3 \ + ADCS R22, R4 \ + \ + MUL R12, R17, R21 \ + ADDS R21, R4 \ + \ + \ // m * N + loadModulus(R5,R6,R7,R8) \ + mul(R17,R18,R19,R20,R21,R22,R23,R24) \ + \ + \ // Add the 512-bit intermediate to m*N + MOVD ZR, R25 \ + ADDS R9, R17 \ + ADCS R10, R18 \ + ADCS R11, R19 \ + ADCS R12, R20 \ + ADCS R13, R21 \ + ADCS R14, R22 \ + ADCS R15, R23 \ + ADCS R16, R24 \ + ADCS ZR, R25 \ + \ + \ // Our output is R21:R22:R23:R24. Reduce mod p if necessary. + SUBS R5, R21, R10 \ + SBCS R6, R22, R11 \ + SBCS R7, R23, R12 \ + SBCS R8, R24, R13 \ + \ + CSEL CS, R10, R21, R1 \ + CSEL CS, R11, R22, R2 \ + CSEL CS, R12, R23, R3 \ + CSEL CS, R13, R24, R4 diff --git a/crypto/bn256/cloudflare/mul_bmi2.h b/crypto/bn256/cloudflare/mul_bmi2.h deleted file mode 100644 index 71ad0499a..000000000 --- a/crypto/bn256/cloudflare/mul_bmi2.h +++ /dev/null @@ -1,112 +0,0 @@ -#define mulBMI2(a0,a1,a2,a3, rb) \ - MOVQ a0, DX \ - MOVQ $0, R13 \ - MULXQ 0+rb, R8, R9 \ - MULXQ 8+rb, AX, R10 \ - ADDQ AX, R9 \ - MULXQ 16+rb, AX, R11 \ - ADCQ AX, R10 \ - MULXQ 24+rb, AX, R12 \ - ADCQ AX, R11 \ - ADCQ $0, R12 \ - ADCQ $0, R13 \ - \ - MOVQ a1, DX \ - MOVQ $0, R14 \ - MULXQ 0+rb, AX, BX \ - ADDQ AX, R9 \ - ADCQ BX, R10 \ - MULXQ 16+rb, AX, BX \ - ADCQ AX, R11 \ - ADCQ BX, R12 \ - ADCQ $0, R13 \ - MULXQ 8+rb, AX, BX \ - ADDQ AX, R10 \ - ADCQ BX, R11 \ - MULXQ 24+rb, AX, BX \ - ADCQ AX, R12 \ - ADCQ BX, R13 \ - ADCQ $0, R14 \ - \ - MOVQ a2, DX \ - MOVQ $0, R15 \ - MULXQ 0+rb, AX, BX \ - ADDQ AX, R10 \ - ADCQ BX, R11 \ - MULXQ 16+rb, AX, BX \ - ADCQ AX, R12 \ - ADCQ BX, R13 \ - ADCQ $0, R14 \ - MULXQ 8+rb, AX, BX \ - ADDQ AX, R11 \ - ADCQ BX, R12 \ - MULXQ 24+rb, AX, BX \ - ADCQ AX, R13 \ - ADCQ BX, R14 \ - ADCQ $0, R15 \ - \ - MOVQ a3, DX \ - MULXQ 0+rb, AX, BX \ - ADDQ AX, R11 \ - ADCQ BX, R12 \ - MULXQ 16+rb, AX, BX \ - ADCQ AX, R13 \ - ADCQ BX, R14 \ - ADCQ $0, R15 \ - MULXQ 8+rb, AX, BX \ - ADDQ AX, R12 \ - ADCQ BX, R13 \ - MULXQ 24+rb, AX, BX \ - ADCQ AX, R14 \ - ADCQ BX, R15 - -#define gfpReduceBMI2() \ - \ // m = (T * N') mod R, store m in R8:R9:R10:R11 - MOVQ ·np+0(SB), DX \ - MULXQ 0(SP), R8, R9 \ - MULXQ 8(SP), AX, R10 \ - ADDQ AX, R9 \ - MULXQ 16(SP), AX, R11 \ - ADCQ AX, R10 \ - MULXQ 24(SP), AX, BX \ - ADCQ AX, R11 \ - \ - MOVQ ·np+8(SB), DX \ - MULXQ 0(SP), AX, BX \ - ADDQ AX, R9 \ - ADCQ BX, R10 \ - MULXQ 16(SP), AX, BX \ - ADCQ AX, R11 \ - MULXQ 8(SP), AX, BX \ - ADDQ AX, R10 \ - ADCQ BX, R11 \ - \ - MOVQ ·np+16(SB), DX \ - MULXQ 0(SP), AX, BX \ - ADDQ AX, R10 \ - ADCQ BX, R11 \ - MULXQ 8(SP), AX, BX \ - ADDQ AX, R11 \ - \ - MOVQ ·np+24(SB), DX \ - MULXQ 0(SP), AX, BX \ - ADDQ AX, R11 \ - \ - storeBlock(R8,R9,R10,R11, 64(SP)) \ - \ - \ // m * N - mulBMI2(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64(SP)) \ - \ - \ // Add the 512-bit intermediate to m*N - MOVQ $0, AX \ - ADDQ 0(SP), R8 \ - ADCQ 8(SP), R9 \ - ADCQ 16(SP), R10 \ - ADCQ 24(SP), R11 \ - ADCQ 32(SP), R12 \ - ADCQ 40(SP), R13 \ - ADCQ 48(SP), R14 \ - ADCQ 56(SP), R15 \ - ADCQ $0, AX \ - \ - gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX) diff --git a/crypto/bn256/cloudflare/mul_bmi2_amd64.h b/crypto/bn256/cloudflare/mul_bmi2_amd64.h new file mode 100644 index 000000000..71ad0499a --- /dev/null +++ b/crypto/bn256/cloudflare/mul_bmi2_amd64.h @@ -0,0 +1,112 @@ +#define mulBMI2(a0,a1,a2,a3, rb) \ + MOVQ a0, DX \ + MOVQ $0, R13 \ + MULXQ 0+rb, R8, R9 \ + MULXQ 8+rb, AX, R10 \ + ADDQ AX, R9 \ + MULXQ 16+rb, AX, R11 \ + ADCQ AX, R10 \ + MULXQ 24+rb, AX, R12 \ + ADCQ AX, R11 \ + ADCQ $0, R12 \ + ADCQ $0, R13 \ + \ + MOVQ a1, DX \ + MOVQ $0, R14 \ + MULXQ 0+rb, AX, BX \ + ADDQ AX, R9 \ + ADCQ BX, R10 \ + MULXQ 16+rb, AX, BX \ + ADCQ AX, R11 \ + ADCQ BX, R12 \ + ADCQ $0, R13 \ + MULXQ 8+rb, AX, BX \ + ADDQ AX, R10 \ + ADCQ BX, R11 \ + MULXQ 24+rb, AX, BX \ + ADCQ AX, R12 \ + ADCQ BX, R13 \ + ADCQ $0, R14 \ + \ + MOVQ a2, DX \ + MOVQ $0, R15 \ + MULXQ 0+rb, AX, BX \ + ADDQ AX, R10 \ + ADCQ BX, R11 \ + MULXQ 16+rb, AX, BX \ + ADCQ AX, R12 \ + ADCQ BX, R13 \ + ADCQ $0, R14 \ + MULXQ 8+rb, AX, BX \ + ADDQ AX, R11 \ + ADCQ BX, R12 \ + MULXQ 24+rb, AX, BX \ + ADCQ AX, R13 \ + ADCQ BX, R14 \ + ADCQ $0, R15 \ + \ + MOVQ a3, DX \ + MULXQ 0+rb, AX, BX \ + ADDQ AX, R11 \ + ADCQ BX, R12 \ + MULXQ 16+rb, AX, BX \ + ADCQ AX, R13 \ + ADCQ BX, R14 \ + ADCQ $0, R15 \ + MULXQ 8+rb, AX, BX \ + ADDQ AX, R12 \ + ADCQ BX, R13 \ + MULXQ 24+rb, AX, BX \ + ADCQ AX, R14 \ + ADCQ BX, R15 + +#define gfpReduceBMI2() \ + \ // m = (T * N') mod R, store m in R8:R9:R10:R11 + MOVQ ·np+0(SB), DX \ + MULXQ 0(SP), R8, R9 \ + MULXQ 8(SP), AX, R10 \ + ADDQ AX, R9 \ + MULXQ 16(SP), AX, R11 \ + ADCQ AX, R10 \ + MULXQ 24(SP), AX, BX \ + ADCQ AX, R11 \ + \ + MOVQ ·np+8(SB), DX \ + MULXQ 0(SP), AX, BX \ + ADDQ AX, R9 \ + ADCQ BX, R10 \ + MULXQ 16(SP), AX, BX \ + ADCQ AX, R11 \ + MULXQ 8(SP), AX, BX \ + ADDQ AX, R10 \ + ADCQ BX, R11 \ + \ + MOVQ ·np+16(SB), DX \ + MULXQ 0(SP), AX, BX \ + ADDQ AX, R10 \ + ADCQ BX, R11 \ + MULXQ 8(SP), AX, BX \ + ADDQ AX, R11 \ + \ + MOVQ ·np+24(SB), DX \ + MULXQ 0(SP), AX, BX \ + ADDQ AX, R11 \ + \ + storeBlock(R8,R9,R10,R11, 64(SP)) \ + \ + \ // m * N + mulBMI2(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64(SP)) \ + \ + \ // Add the 512-bit intermediate to m*N + MOVQ $0, AX \ + ADDQ 0(SP), R8 \ + ADCQ 8(SP), R9 \ + ADCQ 16(SP), R10 \ + ADCQ 24(SP), R11 \ + ADCQ 32(SP), R12 \ + ADCQ 40(SP), R13 \ + ADCQ 48(SP), R14 \ + ADCQ 56(SP), R15 \ + ADCQ $0, AX \ + \ + gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX) -- cgit v1.2.3