aboutsummaryrefslogtreecommitdiffstats
path: root/crypto/bn256/cloudflare/mul_arm64.h
blob: 75d52217311b13cd6b4ec2fc3782eacee67f9c8b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// mul(c0,...,c7): 256 x 256 -> 512-bit schoolbook multiplication.
//
// Inputs (fixed registers):
//   R1:R2:R3:R4 - operand a as four 64-bit limbs, R1 least significant
//   R5:R6:R7:R8 - operand b as four 64-bit limbs, R5 least significant
// Output:
//   c0..c7      - the product a*b as eight 64-bit limbs, c0 least significant
// Overwritten besides c0..c7: R0, R25, R26, R27, R29 and the condition flags.
//
// The product is built one row at a time (a0*b, a1*b, a2*b, a3*b). Row 0 is
// written straight into c0..c4. Each later row i is first assembled in
// R25:R26:R27:R29 plus a fresh top limb c(i+4), and then added into the
// running result starting at limb c(i).
//
// c0..c7 must be distinct registers that overlap neither the inputs nor the
// scratch registers, because results are written while the inputs are still
// being read.
//
// NOTE(review): the Go assembler guide lists R27 as reserved for the
// toolchain and R29 as the frame pointer; both are used as plain scratch
// here. Confirm that is acceptable at every expansion site.
#define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
    MUL R1, R5, c0 \ // row 0 (a0 * b): c0 = lo(a0*b0)
    UMULH R1, R5, c1 \ // c1 = hi(a0*b0)
    MUL R1, R6, R0 \
    ADDS R0, c1 \ // c1 += lo(a0*b1); starts the carry chain for this row
    UMULH R1, R6, c2 \ // MUL/UMULH leave the flags alone, so the carry survives
    MUL R1, R7, R0 \
    ADCS R0, c2 \ // c2 = hi(a0*b1) + lo(a0*b2) + carry
    UMULH R1, R7, c3 \
    MUL R1, R8, R0 \
    ADCS R0, c3 \ // c3 = hi(a0*b2) + lo(a0*b3) + carry
    UMULH R1, R8, c4 \
    ADCS ZR, c4 \ // c4 = hi(a0*b3) + carry
    \ // row 1 (a1 * b): partial row in R25:R26:R27:R29:c5
    MUL R2, R5, R25 \
    UMULH R2, R5, R26 \
    MUL R2, R6, R0 \
    ADDS R0, R26 \
    UMULH R2, R6, R27 \
    MUL R2, R7, R0 \
    ADCS R0, R27 \
    UMULH R2, R7, R29 \
    MUL R2, R8, R0 \
    ADCS R0, R29 \
    UMULH R2, R8, c5 \
    ADCS ZR, c5 \ // c5 = hi(a1*b3) + carry
    ADDS R25, c1 \ // fold row 1 into the result, shifted up by one limb
    ADCS R26, c2 \
    ADCS R27, c3 \
    ADCS R29, c4 \
    ADCS  ZR, c5 \ // propagate the final carry into the row's top limb
    \ // row 2 (a2 * b): partial row in R25:R26:R27:R29:c6
    MUL R3, R5, R25 \
    UMULH R3, R5, R26 \
    MUL R3, R6, R0 \
    ADDS R0, R26 \
    UMULH R3, R6, R27 \
    MUL R3, R7, R0 \
    ADCS R0, R27 \
    UMULH R3, R7, R29 \
    MUL R3, R8, R0 \
    ADCS R0, R29 \
    UMULH R3, R8, c6 \
    ADCS ZR, c6 \ // c6 = hi(a2*b3) + carry
    ADDS R25, c2 \ // fold row 2 into the result, shifted up by two limbs
    ADCS R26, c3 \
    ADCS R27, c4 \
    ADCS R29, c5 \
    ADCS  ZR, c6 \
    \ // row 3 (a3 * b): partial row in R25:R26:R27:R29:c7
    MUL R4, R5, R25 \
    UMULH R4, R5, R26 \
    MUL R4, R6, R0 \
    ADDS R0, R26 \
    UMULH R4, R6, R27 \
    MUL R4, R7, R0 \
    ADCS R0, R27 \
    UMULH R4, R7, R29 \
    MUL R4, R8, R0 \
    ADCS R0, R29 \
    UMULH R4, R8, c7 \
    ADCS ZR, c7 \ // c7 = hi(a3*b3) + carry
    ADDS R25, c3 \ // fold row 3 into the result, shifted up by three limbs
    ADCS R26, c4 \
    ADCS R27, c5 \
    ADCS R29, c6 \
    ADCS  ZR, c7

// gfpReduce(): reduce the 512-bit value T to four limbs, following the
// steps named in the comments below (m = T*N' mod R, then T + m*N, then a
// conditional subtraction), i.e. the shape of a Montgomery reduction.
//
// Input:
//   R9..R16   - T as eight 64-bit limbs, R9 least significant
// Output:
//   R1..R4    - the reduced value, R1 least significant
// Overwritten: R0-R8, R10-R13, R17-R25, the scratch registers used by
// mul (R26, R27, R29) and the condition flags.
//
// Depends on names defined outside this chunk: the ·np data symbol (four
// 64-bit words are read from it) and the loadModulus macro.
//
// NOTE(review): R18 is used as a general-purpose register. The Go assembler
// guide describes R18 as the platform register (reserved on Apple
// platforms); confirm this macro is only built for targets where that is
// safe. Moving it needs a coordinated change with mul's scratch registers.
#define gfpReduce() \
    \ // m = (T * N') mod R, kept in R1:R2:R3:R4 (only the low four limbs are computed)
    MOVD ·np+0(SB), R17 \ // R17:R18:R19:R20 = the four words of ·np
    MOVD ·np+8(SB), R18 \
    MOVD ·np+16(SB), R19 \
    MOVD ·np+24(SB), R20 \
    \ // T0 * np[0..3], truncated to four limbs -> R1:R2:R3:R4
    MUL R9, R17, R1 \
    UMULH R9, R17, R2 \
    MUL R9, R18, R0 \
    ADDS R0, R2 \
    UMULH R9, R18, R3 \
    MUL R9, R19, R0 \
    ADCS R0, R3 \
    UMULH R9, R19, R4 \
    MUL R9, R20, R0 \
    ADCS R0, R4 \ // carry out of limb 3 is dropped on purpose (mod R)
    \ // T1 * np[0..2], truncated to three limbs, added in at limb 1
    MUL R10, R17, R21 \
    UMULH R10, R17, R22 \
    MUL R10, R18, R0 \
    ADDS R0, R22 \
    UMULH R10, R18, R23 \
    MUL R10, R19, R0 \
    ADCS R0, R23 \
    ADDS R21, R2 \
    ADCS R22, R3 \
    ADCS R23, R4 \
    \ // T2 * np[0..1], truncated to two limbs, added in at limb 2
    MUL R11, R17, R21 \
    UMULH R11, R17, R22 \
    MUL R11, R18, R0 \
    ADDS R0, R22 \
    ADDS R21, R3 \
    ADCS R22, R4 \
    \ // lo(T3 * np0), added in at limb 3
    MUL R12, R17, R21 \
    ADDS R21, R4 \
    \
    \ // m * N: mul reads m from R1..R4 and R5..R8 (filled by loadModulus, presumably with the modulus N) and writes R17..R24
    loadModulus(R5,R6,R7,R8) \
    mul(R17,R18,R19,R20,R21,R22,R23,R24) \
    \
    \ // Add the 512-bit intermediate T to m*N; the low four sums only matter for the carry they pass upward
    MOVD  ZR, R25 \
    ADDS  R9, R17 \
    ADCS R10, R18 \
    ADCS R11, R19 \
    ADCS R12, R20 \
    ADCS R13, R21 \
    ADCS R14, R22 \
    ADCS R15, R23 \
    ADCS R16, R24 \
    ADCS  ZR, R25 \ // R25 = carry out of the top limb; NOTE(review): never read below - confirm callers keep T + m*N below 2^512
    \
    \ // Our output is R21:R22:R23:R24. Reduce mod p if necessary: R10..R13 = output - (R5..R8)
    SUBS R5, R21, R10 \
    SBCS R6, R22, R11 \
    SBCS R7, R23, R12 \
    SBCS R8, R24, R13 \
    \ // carry set means no borrow (output >= R5..R8): take the difference, else keep the output
    CSEL CS, R10, R21, R1 \
    CSEL CS, R11, R22, R2 \
    CSEL CS, R12, R23, R3 \
    CSEL CS, R13, R24, R4