diff options
Diffstat (limited to 'crypto/bn256/cloudflare/gfp_amd64.s')
-rw-r--r-- | crypto/bn256/cloudflare/gfp_amd64.s | 97 |
1 files changed, 97 insertions, 0 deletions
diff --git a/crypto/bn256/cloudflare/gfp_amd64.s b/crypto/bn256/cloudflare/gfp_amd64.s new file mode 100644 index 000000000..2d0176f2e --- /dev/null +++ b/crypto/bn256/cloudflare/gfp_amd64.s @@ -0,0 +1,97 @@ +// +build amd64,!appengine,!gccgo + +#include "gfp.h" +#include "mul.h" +#include "mul_bmi2.h" + +TEXT ·gfpNeg(SB),0,$0-16 + MOVQ ·p2+0(SB), R8 + MOVQ ·p2+8(SB), R9 + MOVQ ·p2+16(SB), R10 + MOVQ ·p2+24(SB), R11 + + MOVQ a+8(FP), DI + SUBQ 0(DI), R8 + SBBQ 8(DI), R9 + SBBQ 16(DI), R10 + SBBQ 24(DI), R11 + + MOVQ $0, AX + gfpCarry(R8,R9,R10,R11,AX, R12,R13,R14,R15,BX) + + MOVQ c+0(FP), DI + storeBlock(R8,R9,R10,R11, 0(DI)) + RET + +TEXT ·gfpAdd(SB),0,$0-24 + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + + loadBlock(0(DI), R8,R9,R10,R11) + MOVQ $0, R12 + + ADDQ 0(SI), R8 + ADCQ 8(SI), R9 + ADCQ 16(SI), R10 + ADCQ 24(SI), R11 + ADCQ $0, R12 + + gfpCarry(R8,R9,R10,R11,R12, R13,R14,R15,AX,BX) + + MOVQ c+0(FP), DI + storeBlock(R8,R9,R10,R11, 0(DI)) + RET + +TEXT ·gfpSub(SB),0,$0-24 + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + + loadBlock(0(DI), R8,R9,R10,R11) + + MOVQ ·p2+0(SB), R12 + MOVQ ·p2+8(SB), R13 + MOVQ ·p2+16(SB), R14 + MOVQ ·p2+24(SB), R15 + MOVQ $0, AX + + SUBQ 0(SI), R8 + SBBQ 8(SI), R9 + SBBQ 16(SI), R10 + SBBQ 24(SI), R11 + + CMOVQCC AX, R12 + CMOVQCC AX, R13 + CMOVQCC AX, R14 + CMOVQCC AX, R15 + + ADDQ R12, R8 + ADCQ R13, R9 + ADCQ R14, R10 + ADCQ R15, R11 + + MOVQ c+0(FP), DI + storeBlock(R8,R9,R10,R11, 0(DI)) + RET + +TEXT ·gfpMul(SB),0,$160-24 + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + + // Jump to a slightly different implementation if MULX isn't supported. + CMPB runtime·support_bmi2(SB), $0 + JE nobmi2Mul + + mulBMI2(0(DI),8(DI),16(DI),24(DI), 0(SI)) + storeBlock( R8, R9,R10,R11, 0(SP)) + storeBlock(R12,R13,R14,R15, 32(SP)) + gfpReduceBMI2() + JMP end + +nobmi2Mul: + mul(0(DI),8(DI),16(DI),24(DI), 0(SI), 0(SP)) + gfpReduce(0(SP)) + +end: + MOVQ c+0(FP), DI + storeBlock(R12,R13,R14,R15, 0(DI)) + RET |